From 6997e28f6e46a506eaacc18e6a3c62fcb63e60b9 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Wed, 27 Nov 2024 02:02:01 -0800
Subject: [PATCH] Revert "Use an env var SGLANG_SET_CPU_AFFINITY to set cpu
 affinity; turn it off by default" (#2221)

---
 python/sglang/bench_one_batch_server.py        |  4 ++--
 python/sglang/srt/configs/model_config.py      |  8 ++++++--
 .../srt/layers/attention/flashinfer_backend.py |  6 +++---
 python/sglang/srt/managers/scheduler.py        |  8 +++-----
 python/sglang/srt/utils.py                     | 13 ++++---------
 python/sglang/test/test_utils.py               |  4 ++--
 6 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py
index 9d6048bc115..9737b8bd2c3 100644
--- a/python/sglang/bench_one_batch_server.py
+++ b/python/sglang/bench_one_batch_server.py
@@ -5,9 +5,9 @@
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
-python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 7517657b477..a05c75693ce 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -14,13 +14,13 @@
 
 import json
 import logging
+import os
 from enum import IntEnum, auto
 from typing import List, Optional
 
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
@@ -59,9 +59,13 @@ def __init__(
 
         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
+        allow_long_context = os.environ.get(
+            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
+        )
+
         if context_length is not None:
             if context_length > derived_context_len:
-                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
+                if allow_long_context:
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index 16929498bec..fb391e627fb 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -18,7 +18,7 @@
 from sglang.global_config import global_config
 from sglang.srt.layers.attention import AttentionBackend
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
+from sglang.srt.utils import is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -47,8 +47,8 @@ def __init__(self, model_runner: ModelRunner):
 
         # Parse constants
         if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
-            self.decode_use_tensor_cores = get_bool_env_var(
-                "SGLANG_FLASHINFER_USE_TENSOR_CORE"
+            self.decode_use_tensor_cores = (
+                os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"].lower() == "true"
             )
         else:
             if not _grouped_size_compiled_for_decode_kernels(
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 7f47a522ed9..5e8197de85c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -71,10 +71,9 @@
     broadcast_pyobj,
     configure_logger,
     crash_on_warnings,
-    get_bool_env_var,
     get_zmq_socket,
+    gpu_proc_affinity,
     kill_parent_process,
-    set_gpu_proc_affinity,
     set_random_seed,
     suppress_other_loggers,
 )
@@ -83,7 +82,7 @@
 logger = logging.getLogger(__name__)
 
 # Test retract decode
-test_retract = get_bool_env_var("SGLANG_TEST_RETRACT")
+test_retract = os.getenv("SGLANG_TEST_RETRACT", "false").lower() == "true"
 
 
 class Scheduler:
@@ -1406,8 +1405,7 @@ def run_scheduler_process(
     pipe_writer,
 ):
     # set cpu affinity to this gpu process
-    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
-        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
+    gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
 
     # [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var
     if dp_rank is None and "DP_RANK" in os.environ:
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 0856b53f25c..0222824e640 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -72,7 +72,7 @@ def is_flashinfer_available():
     """
     Check whether flashinfer is available.
     As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
""" - if get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"): + if os.environ.get("SGLANG_IS_FLASHINFER_AVAILABLE", "true") == "false": return False return torch.cuda.is_available() and not is_hip() @@ -626,7 +626,7 @@ async def authentication(request, call_next): def prepare_model_and_tokenizer(model_path: str, tokenizer_path: str): - if get_bool_env_var("SGLANG_USE_MODELSCOPE"): + if "SGLANG_USE_MODELSCOPE" in os.environ: if not os.path.exists(model_path): from modelscope import snapshot_download @@ -931,7 +931,7 @@ def get_nvgpu_memory_capacity(): def crash_on_warnings(): # Crash on warning if we are running CI tests - return get_bool_env_var("SGLANG_IS_IN_CI") + return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true" def get_device_name(device_id: int = 0) -> str: @@ -990,7 +990,7 @@ def direct_register_custom_op( my_lib._register_fake(op_name, fake_impl) -def set_gpu_proc_affinity( +def gpu_proc_affinity( tp_size: int, nnodes: int, gpu_id: int, @@ -1022,8 +1022,3 @@ def set_gpu_proc_affinity( # set cpu_affinity to current process p.cpu_affinity(bind_cpu_ids) logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}") - - -def get_bool_env_var(name: str, default: str = "false") -> bool: - value = os.getenv(name, default) - return value.lower() in ("true", "1") diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 3089668443e..be1755bd379 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -22,7 +22,7 @@ from sglang.global_config import global_config from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.srt.utils import get_bool_env_var, kill_child_process +from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback @@ -44,7 +44,7 @@ def is_in_ci(): """Return whether it is in CI runner.""" - return get_bool_env_var("SGLANG_IS_IN_CI") + return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true" if is_in_ci():