Merge branch 'main' into llavavid_fix

zhaochenyang20 authored Feb 25, 2025
2 parents 9cf2783 + 6ce9dbe commit c657b13
Showing 45 changed files with 3,751 additions and 1,412 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr-test-amd.yml
@@ -90,7 +90,7 @@ jobs:
- name: MLA TEST
timeout-minutes: 20
run: |
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py TestMLA
finish:
needs: [
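The only functional change in this workflow is the extra `TestMLA` argument, which narrows the CI run to a single test class. A minimal sketch of why that works, assuming `test_mla.py` is a standard `unittest` script that ends with `unittest.main()` (the real file's contents are not shown here):

```python
# Hypothetical layout of a unittest-based test_mla.py. unittest.main() treats extra
# command-line arguments as test names, so "python3 test_mla.py TestMLA" runs only
# the TestMLA class instead of every test defined in the file.
import unittest


class TestMLA(unittest.TestCase):
    def test_placeholder(self):
        # placeholder body; the real checks live in the repository's test_mla.py
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
```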
13 changes: 0 additions & 13 deletions .github/workflows/pr-test.yml
@@ -107,19 +107,6 @@ jobs:
bash scripts/ci_install_dependency.sh
- name: Run test
if: github.event.pull_request.head.repo.fork == false
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
timeout-minutes: 30
run: |
RANGE=${{ matrix.range }}
range_begin=${RANGE%-*}
range_end=${RANGE#*-}
cd test/srt
python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
- name: Run test (fork)
if: github.event.pull_request.head.repo.fork == true
timeout-minutes: 30
run: |
RANGE=${{ matrix.range }}
2 changes: 1 addition & 1 deletion docs/backend/function_calling.ipynb
@@ -426,7 +426,7 @@
"from sglang.srt.managers.io_struct import Tool, Function\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
"tokenizer = llm.orchestrator.tokenizer\n",
"tokenizer = llm.tokenizer_manager.tokenizer\n",
"input_ids = tokenizer.apply_chat_template(\n",
" messages, tokenize=True, add_generation_prompt=True, tools=tools\n",
")\n",
2 changes: 1 addition & 1 deletion docs/backend/server_arguments.md
@@ -47,7 +47,7 @@ Please consult the documentation below to learn more about the parameters you ma
* `trust_remote_code`: If `True`, will use locally cached config files, otherwise use remote configs in HuggingFace.
* `dtype`: Dtype used for the model, defaults to `bfloat16`.
* `kv_cache_dtype`: Dtype of the kv cache, defaults to the `dtype`.
* `context_length`: The number of tokens our model can process *including the input*. Not that extending the default might lead to strange behavior.
* `context_length`: The number of tokens our model can process *including the input*. Note that extending the default might lead to strange behavior.
* `device`: The device we put the model, defaults to `cuda`.
* `chat_template`: The chat template to use. Deviating from the default might lead to unexpected responses. For multi-modal chat templates, refer to [here](https://docs.sglang.ai/backend/openai_api_vision.html#Chat-Template).
* `is_embedding`: Set to true to perform [embedding](https://docs.sglang.ai/backend/openai_api_embeddings.html) / [encode](https://docs.sglang.ai/backend/native_api.html#Encode-(embedding-model)) and [reward](https://docs.sglang.ai/backend/native_api.html#Classify-(reward-model)) tasks.
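For illustration, a minimal sketch of passing a few of the options listed above to the offline engine, assuming (as in the function-calling notebook earlier in this commit) that `sgl.Engine` forwards keyword arguments to the same server arguments; the values are placeholders, not recommendations:

```python
import sglang as sgl

# Hypothetical values for illustration; adjust to your model and hardware.
llm = sgl.Engine(
    model_path="meta-llama/Meta-Llama-3.1-8B-Instruct",
    dtype="bfloat16",      # dtype used for the model
    context_length=8192,   # tokens the model can process, including the input
)
print(llm.generate("The capital of France is", {"max_new_tokens": 16}))
```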
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -1,5 +1,6 @@
import os
import sys
from datetime import datetime

sys.path.insert(0, os.path.abspath("../.."))

@@ -9,7 +10,7 @@
__version__ = locals()["__version__"]

project = "SGLang"
copyright = "2023-2024, SGLang"
copyright = f"2023-{datetime.now().year}, SGLang"
author = "SGLang Team"

version = __version__
2 changes: 1 addition & 1 deletion docs/router/router.md
@@ -10,7 +10,7 @@ The router is an independent Python package, and it can be used as a drop-in rep
pip install sglang-router
```

Detailed usage of the router can be found in [launch_router](https://github.com/sgl-project/sglang/blob/main/sgl-router/py_src/sglang_router/launch_router.py) and [launch_server](https://github.com/sgl-project/sglang/blob/main/sgl-router/py_src/sglang/launch_server.py). Also, you can directly run the following command to see the usage of the router.
Detailed usage of the router can be found in [launch_router](https://github.com/sgl-project/sglang/blob/main/sgl-router/py_src/sglang_router/launch_router.py) and [launch_server](https://github.com/sgl-project/sglang/blob/main/sgl-router/py_src/sglang_router/launch_server.py). Also, you can directly run the following command to see the usage of the router.

```bash
python -m sglang_router.launch_server --help
27 changes: 12 additions & 15 deletions docs/start/install.md
@@ -1,26 +1,24 @@
# Install SGLang

You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid fixed issues and environment-related problems.
You can install SGLang using any of the methods below.

## Method 1: With pip or uv
For running DeepSeek V3/R1, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is recommended to use the [latest version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid environment-related problems.

We recommend using uv to install the dependencies with a higher installation speed:
## Method 1: With pip

```bash
pip install --upgrade pip
pip install uv
uv pip install sgl-kernel --force-reinstall --no-deps
uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
```

**Quick Fix to Installation**
**Quick Fixes to Installation**

- SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`.
- SGLang currently uses torch 2.5, so you need to install flashinfer for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the FlashInfer pypi package is called `flashinfer-python` instead of `flashinfer`.

- If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
- If you encounter `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:

1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
2. Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above.
2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.

- If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the version of `transformers` specified in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, this means running `pip install transformers==4.48.3`. A small environment-check sketch follows this list.
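A small diagnostic sketch for the environment issues above, assuming `torch` is already installed; it only reports what the current environment would give SGLang and flashinfer to work with:

```python
# Quick environment check (assumes torch is installed). SGLang currently targets
# torch 2.5 with CUDA 12.4 wheels, and flashinfer needs CUDA_HOME to point at a
# matching CUDA toolkit whenever it has to compile anything locally.
import os

import torch

print("CUDA_HOME        =", os.environ.get("CUDA_HOME", "<not set>"))
print("torch version    =", torch.__version__)        # expect 2.5.x
print("torch CUDA build =", torch.version.cuda)       # expect 12.4 for the cu124 wheels
print("GPU available    =", torch.cuda.is_available())
```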

@@ -31,15 +29,14 @@ git clone -b v0.4.3.post2 https://github.com/sgl-project/sglang.git
cd sglang
pip install --upgrade pip
pip install sgl-kernel --force-reinstall --no-deps
pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
```

Note: SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html).
Note: SGLang currently uses torch 2.5, so you need to install flashinfer for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html).

If you want to work on development in SGLang, it is highly recommended that you use docker. Please refer to [setup docker container](https://github.com/sgl-project/sglang/blob/main/docs/developer/development_guide_using_docker.md#setup-docker-container) for guidance. The image used is `lmsysorg/sglang:dev`.
If you want to develop SGLang, it is recommended to use docker. Please refer to [setup docker container](https://github.com/sgl-project/sglang/blob/main/docs/developer/development_guide_using_docker.md#setup-docker-container) for guidance. The docker image is `lmsysorg/sglang:dev`.

Note: To AMD ROCm system with Instinct/MI GPUs, do following instead:
Note: For AMD ROCm systems with Instinct/MI GPUs, do the following instead:

```
# Use the last release branch
@@ -68,7 +65,7 @@ docker run --gpus all \
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
```

Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below:
Note: For AMD ROCm systems with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images; example usage is shown below:

```bash
docker build --build-arg SGL_BRANCH=v0.4.3.post2 -t v0.4.3.post2-rocm630 -f Dockerfile.rocm .
48 changes: 35 additions & 13 deletions python/pyproject.toml
@@ -17,32 +17,54 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]

[project.optional-dependencies]
runtime_common = [
"aiohttp", "decord", "fastapi",
"hf_transfer", "huggingface_hub", "interegular", "modelscope",
"orjson", "packaging", "pillow", "prometheus-client>=0.20.0",
"psutil", "pydantic", "python-multipart", "pyzmq>=25.1.2",
"torchao>=0.7.0", "uvicorn", "uvloop", "xgrammar==0.1.10", "ninja", "transformers==4.48.3"
"aiohttp",
"decord",
"fastapi",
"hf_transfer",
"huggingface_hub",
"interegular",
"modelscope",
"orjson",
"packaging",
"pillow",
"prometheus-client>=0.20.0",
"psutil",
"pydantic",
"python-multipart",
"pyzmq>=25.1.2",
"torchao>=0.7.0",
"uvicorn",
"uvloop",
"xgrammar==0.1.10",
"ninja",
"transformers==4.48.3",
]
srt = [
"sglang[runtime_common]", "cuda-python",
"sgl-kernel>=0.0.3.post6", "torch", "vllm>=0.6.4.post1,<=0.7.2",
"sglang[runtime_common]",
"sgl-kernel>=0.0.3.post6",
"flashinfer_python>=0.2.1.post2",
"torch==2.5.1",
"vllm>=0.6.4.post1,<=0.7.2",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
]

# HIP (Heterogeneous-computing Interface for Portability) for AMD
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11", "sgl-kernel>=0.0.3.post1"]
srt_hip = ["sglang[runtime_common]", "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]

# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
#For Intel Gaudi(device : hpu) follow the installation guide
#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]

# For Intel Gaudi(device : hpu) follow the installation guide
# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]

# CPU: currently, there are no pre-built vllm wheels for CPU.
# To install vllm for CPU, please follow the instruction here:
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
srt_cpu = ["sglang[runtime_common]", "torch", "outlines>=0.0.44,<0.1.0"]
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]

openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
20 changes: 20 additions & 0 deletions python/sglang/srt/configs/model_config.py
@@ -14,6 +14,7 @@

import json
import logging
import math
from enum import IntEnum, auto
from typing import List, Optional, Set, Union

@@ -103,7 +104,20 @@ def __init__(
self.head_dim = 256
self.attention_arch = AttentionArch.MLA
self.kv_lora_rank = self.hf_config.kv_lora_rank
self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
self.v_head_dim = self.hf_config.v_head_dim

# Handle rope scaling with yarn
self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
if self.hf_config.rope_scaling:
mscale_all_dim = self.hf_config.rope_scaling.get(
"mscale_all_dim", False
)
scaling_factor = self.hf_config.rope_scaling["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale

elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
self.head_dim = 128
self.attention_arch = AttentionArch.MLA
@@ -414,3 +428,9 @@ def is_multimodal_model(model_architectures: List[str]):

def is_encoder_decoder_model(model_architectures: List[str]):
return "MllamaForConditionalGeneration" in model_architectures


def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
if scale <= 1:
return 1.0
return 0.1 * mscale * math.log(scale) + 1.0
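As a standalone illustration of the new scaling logic, here is a minimal sketch that reproduces the computation above with made-up DeepSeek-style numbers; the head dimensions and rope-scaling values are placeholders, not read from any real config:

```python
import math


def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    # same helper as added in the diff above
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


# Hypothetical MLA config values, for illustration only.
qk_nope_head_dim = 128
qk_rope_head_dim = 64
rope_scaling = {"factor": 40, "mscale_all_dim": 1.0}

# Base attention scaling: 1 / sqrt(total qk head dim).
scaling = 1 / math.sqrt(qk_nope_head_dim + qk_rope_head_dim)

# With yarn rope scaling, the softmax scale is corrected by mscale**2.
mscale = yarn_get_mscale(rope_scaling["factor"], float(rope_scaling["mscale_all_dim"]))
scaling = scaling * mscale * mscale

print(f"mscale={mscale:.4f}, scaling={scaling:.6f}")  # ~1.3689 and ~0.135 for these numbers
```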
12 changes: 3 additions & 9 deletions python/sglang/srt/constrained/outlines_backend.py
@@ -28,17 +28,11 @@
BaseGrammarObject,
)
from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
from sglang.srt.utils import is_hip

is_hip_ = is_hip()

if is_hip_:
try:
from outlines.fsm.json_schema import build_regex_from_schema
except ImportError:
from outlines_core.fsm.json_schema import build_regex_from_schema
else:
try:
from outlines.fsm.json_schema import build_regex_from_schema
except ImportError:
from outlines_core.fsm.json_schema import build_regex_from_schema


logger = logging.getLogger(__name__)
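The net effect of this change is easier to see flattened out: the HIP and non-HIP branches performed the same import, so the platform check added nothing. A minimal sketch of the remaining pattern, with the module paths taken from the diff above:

```python
# Single import fallback, no platform branching needed: older outlines releases
# expose build_regex_from_schema under outlines.fsm, newer ones under outlines_core.
try:
    from outlines.fsm.json_schema import build_regex_from_schema
except ImportError:
    from outlines_core.fsm.json_schema import build_regex_from_schema
```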