Skip to content

Commit

Permalink
Merge branch 'main' into debug_radixcache_stack_overflow
Browse files: browse the repository at this point in the history
  • Loading branch information
luzengxiangcn committed Jan 23, 2025
2 parents af2e328 + 7353fb9 commit 2cc5089
Show file tree
Hide file tree
Showing 27 changed files with 1,072 additions and 240 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/pr-test-sgl-kernel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,16 @@ jobs:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Install dependencies
- name: Install
run: |
bash scripts/ci_install_dependency.sh
pip3 install torch==2.5.1 && pip3 install pytest && pip3 install vllm
pip3 uninstall sgl-kernel -y || true
find . -name index.lock -delete
cd sgl-kernel
git submodule update --init --recursive
pip3 install -e . --force-reinstall
git submodule deinit --all --force && git submodule sync --recursive && git submodule update --init --force --recursive
pip3 install .
pip3 list | grep sgl-kernel
- name: Run test
Expand All @@ -57,7 +57,7 @@ jobs:
pip3 uninstall sgl-kernel -y
finish:
needs: [unit-test]
needs: [unit-test, lint]
runs-on: ubuntu-latest
steps:
- name: Finish
Expand Down
18 changes: 16 additions & 2 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ jobs:
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_default
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
- name: Benchmark online latency
timeout-minutes: 10
Expand All @@ -148,6 +148,13 @@ jobs:
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
- name: Benchmark online latency (EAGLE)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
performance-test-1-gpu-part-2:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner
Expand Down Expand Up @@ -196,7 +203,13 @@ jobs:
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_default
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
- name: Benchmark offline throughput (TP=2)
timeout-minutes: 10
Expand All @@ -210,6 +223,7 @@ jobs:
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
accuracy-test-1-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,5 @@ compile_commands.json

# VSCode
.vscode

1
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "sgl-kernel/3rdparty/cccl"]
path = sgl-kernel/3rdparty/cccl
url = https://github.com/NVIDIA/cccl.git
[submodule "sgl-kernel/3rdparty/flashinfer"]
path = sgl-kernel/3rdparty/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
8 changes: 3 additions & 5 deletions python/sglang/bench_one_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument("--correctness-test", action="store_true")
parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"SGLANG_TORCH_PROFILER_DIR to enable profiler.",
"--profile", action="store_true", help="Use Torch Profiler."
)
parser.add_argument(
"--profile-filename-prefix",
Expand Down Expand Up @@ -381,6 +378,7 @@ def latency_test_run_once(
parent_dir = os.path.dirname(os.path.abspath(profile_filename))
os.makedirs(parent_dir, exist_ok=True)
profiler.export_chrome_trace(profile_filename)
rank_print(f"torch profiler chrome trace saved to {profile_filename}")

# Record decode timing from 2nd output
if output_len > 1:
Expand Down Expand Up @@ -451,7 +449,7 @@ def latency_test(
il,
ol,
server_args.device,
bench_args.profile,
bench_args.profile if tp_rank == 0 else None,
bench_args.profile_filename_prefix,
)
if ret is not None:
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/logits_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def fused_softcap_kernel(
n_elements,
BLOCK_SIZE: tl.constexpr,
):
pid = tl.program_id(0)
pid = tl.program_id(0).to(tl.int64)
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
Expand Down
7 changes: 2 additions & 5 deletions python/sglang/srt/layers/sampler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import logging
from typing import Dict, List
from typing import List

import torch
from torch import nn

from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
from sglang.srt.utils import crash_on_warnings, is_flashinfer_available

Expand Down Expand Up @@ -109,8 +108,6 @@ def forward(
f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
)

batch_next_token_ids = batch_next_token_ids.to(torch.int32)

# Attach logprobs to logits_output (in-place modification)
if return_logprob:
if any(x > 0 for x in top_logprobs_nums):
Expand All @@ -124,7 +121,7 @@ def forward(
batch_next_token_ids,
]

return batch_next_token_ids
return batch_next_token_ids.to(torch.int32)

def _apply_custom_logit_processor(
self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
Expand Down
14 changes: 7 additions & 7 deletions python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,13 +550,13 @@ class ScheduleBatch:
next_batch_sampling_info: SamplingBatchInfo = None

# Batched arguments to model runner
input_ids: torch.Tensor = None
input_embeds: torch.Tensor = None
req_pool_indices: torch.Tensor = None
seq_lens: torch.Tensor = None
input_ids: torch.Tensor = None # shape: [b], int32
input_embeds: torch.Tensor = None # shape: [b, hidden_size], float32
req_pool_indices: torch.Tensor = None # shape: [b], int32
seq_lens: torch.Tensor = None # shape: [b], int64
# The output locations of the KV cache
out_cache_loc: torch.Tensor = None
output_ids: torch.Tensor = None
out_cache_loc: torch.Tensor = None # shape: [b], int32
output_ids: torch.Tensor = None # shape: [b], int32

# The sum of all sequence lengths
seq_lens_sum: int = None
Expand Down Expand Up @@ -1026,7 +1026,7 @@ def prepare_for_idle(self):
self.input_ids = torch.empty(0, dtype=torch.int32, device=self.device)
self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
self.out_cache_loc = torch.empty(0, dtype=torch.int32, device=self.device)
self.req_pool_indices = torch.empty(0, dtype=torch.int64, device=self.device)
self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
self.seq_lens_sum = 0
self.extend_num_tokens = 0
self.sampling_info = SamplingBatchInfo.from_schedule_batch(
Expand Down
Loading

0 comments on commit 2cc5089

Please sign in to comment.