From c77c1e05badb5f5bf774872c3498b21eeb0aef20 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Thu, 7 Nov 2024 15:42:47 -0800 Subject: [PATCH 01/12] fix black in pre-commit (#1940) --- .pre-commit-config.yaml | 6 +- docs/backend/native_api.ipynb | 125 +++--- docs/backend/offline_engine_api.ipynb | 48 +-- docs/backend/openai_api_completions.ipynb | 109 +++--- docs/backend/openai_api_embeddings.ipynb | 48 +-- docs/backend/openai_api_vision.ipynb | 48 +-- docs/conf.py | 8 +- docs/start/send_request.ipynb | 72 ++-- .../trace_and_evaluate_rag_using_parea.ipynb | 45 ++- examples/runtime/engine/input_ids.py | 1 + python/sglang/srt/configs/model_config.py | 6 +- .../srt/layers/quantization/base_config.py | 10 +- .../srt/layers/vocab_parallel_embedding.py | 362 +++++++++++------- python/sglang/srt/managers/io_struct.py | 6 +- python/sglang/srt/managers/schedule_batch.py | 4 +- python/sglang/srt/managers/scheduler.py | 47 +-- .../sglang/srt/managers/tokenizer_manager.py | 18 +- .../sglang/srt/metrics/metrics_collector.py | 75 +++- python/sglang/srt/models/gpt2.py | 66 ++-- python/sglang/srt/server.py | 7 +- python/sglang/srt/server_args.py | 2 +- python/sglang/test/test_utils.py | 16 +- python/sglang/utils.py | 1 + rust/test_bindings.py | 2 +- scripts/playground/reference_hf.py | 10 +- test/srt/models/test_generation_models.py | 2 +- test/srt/test_openai_server.py | 1 + test/srt/test_skip_tokenizer_init.py | 1 + test/srt/test_srt_engine.py | 5 +- 29 files changed, 642 insertions(+), 509 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43cd18118d..265339ec4b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,6 @@ repos: rev: 24.10.0 hooks: - id: black - additional_dependencies: ['.[jupyter]'] - types: [python, jupyter] - types_or: [python, jupyter] + types: [python] + - id: black-jupyter + types: [jupyter] diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 04cee07766..7a58c00a54 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -34,10 +34,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:08.536886Z", - "iopub.status.busy": "2024-11-05T05:08:08.536763Z", - "iopub.status.idle": "2024-11-05T05:08:34.725831Z", - "shell.execute_reply": "2024-11-05T05:08:34.725316Z" + "iopub.execute_input": "2024-11-07T18:44:42.063503Z", + "iopub.status.busy": "2024-11-07T18:44:42.063379Z", + "iopub.status.idle": "2024-11-07T18:45:07.255300Z", + "shell.execute_reply": "2024-11-07T18:45:07.254547Z" } }, "outputs": [], @@ -73,10 +73,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:34.727530Z", - "iopub.status.busy": "2024-11-05T05:08:34.727333Z", - "iopub.status.idle": "2024-11-05T05:08:35.359784Z", - "shell.execute_reply": "2024-11-05T05:08:35.359090Z" + "iopub.execute_input": "2024-11-07T18:45:07.258292Z", + "iopub.status.busy": "2024-11-07T18:45:07.257710Z", + "iopub.status.idle": "2024-11-07T18:45:07.611559Z", + "shell.execute_reply": "2024-11-07T18:45:07.610842Z" } }, "outputs": [], @@ -101,10 +101,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.362286Z", - "iopub.status.busy": "2024-11-05T05:08:35.362140Z", - "iopub.status.idle": "2024-11-05T05:08:35.368711Z", - "shell.execute_reply": "2024-11-05T05:08:35.368220Z" + "iopub.execute_input": "2024-11-07T18:45:07.613911Z", + "iopub.status.busy": "2024-11-07T18:45:07.613746Z", + 
"iopub.status.idle": "2024-11-07T18:45:07.620286Z", + "shell.execute_reply": "2024-11-07T18:45:07.619779Z" } }, "outputs": [], @@ -132,10 +132,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.371313Z", - "iopub.status.busy": "2024-11-05T05:08:35.370877Z", - "iopub.status.idle": "2024-11-05T05:08:35.376712Z", - "shell.execute_reply": "2024-11-05T05:08:35.376230Z" + "iopub.execute_input": "2024-11-07T18:45:07.622407Z", + "iopub.status.busy": "2024-11-07T18:45:07.622267Z", + "iopub.status.idle": "2024-11-07T18:45:07.628290Z", + "shell.execute_reply": "2024-11-07T18:45:07.627793Z" } }, "outputs": [], @@ -164,10 +164,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.378982Z", - "iopub.status.busy": "2024-11-05T05:08:35.378597Z", - "iopub.status.idle": "2024-11-05T05:08:35.391820Z", - "shell.execute_reply": "2024-11-05T05:08:35.391336Z" + "iopub.execute_input": "2024-11-07T18:45:07.630585Z", + "iopub.status.busy": "2024-11-07T18:45:07.630235Z", + "iopub.status.idle": "2024-11-07T18:45:07.643498Z", + "shell.execute_reply": "2024-11-07T18:45:07.643007Z" } }, "outputs": [], @@ -183,10 +183,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.393748Z", - "iopub.status.busy": "2024-11-05T05:08:35.393606Z", - "iopub.status.idle": "2024-11-05T05:08:35.398645Z", - "shell.execute_reply": "2024-11-05T05:08:35.398145Z" + "iopub.execute_input": "2024-11-07T18:45:07.645336Z", + "iopub.status.busy": "2024-11-07T18:45:07.645196Z", + "iopub.status.idle": "2024-11-07T18:45:07.650363Z", + "shell.execute_reply": "2024-11-07T18:45:07.649837Z" } }, "outputs": [], @@ -211,10 +211,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.400683Z", - "iopub.status.busy": "2024-11-05T05:08:35.400419Z", - "iopub.status.idle": "2024-11-05T05:08:35.406146Z", - "shell.execute_reply": "2024-11-05T05:08:35.405661Z" + "iopub.execute_input": "2024-11-07T18:45:07.652212Z", + "iopub.status.busy": "2024-11-07T18:45:07.652076Z", + "iopub.status.idle": "2024-11-07T18:45:07.658633Z", + "shell.execute_reply": "2024-11-07T18:45:07.658119Z" } }, "outputs": [], @@ -241,10 +241,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.408176Z", - "iopub.status.busy": "2024-11-05T05:08:35.407884Z", - "iopub.status.idle": "2024-11-05T05:08:35.413587Z", - "shell.execute_reply": "2024-11-05T05:08:35.413108Z" + "iopub.execute_input": "2024-11-07T18:45:07.660468Z", + "iopub.status.busy": "2024-11-07T18:45:07.660325Z", + "iopub.status.idle": "2024-11-07T18:45:07.666476Z", + "shell.execute_reply": "2024-11-07T18:45:07.665984Z" } }, "outputs": [], @@ -271,10 +271,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.416090Z", - "iopub.status.busy": "2024-11-05T05:08:35.415793Z", - "iopub.status.idle": "2024-11-05T05:08:36.552549Z", - "shell.execute_reply": "2024-11-05T05:08:36.551870Z" + "iopub.execute_input": "2024-11-07T18:45:07.668242Z", + "iopub.status.busy": "2024-11-07T18:45:07.668108Z", + "iopub.status.idle": "2024-11-07T18:45:08.725709Z", + "shell.execute_reply": "2024-11-07T18:45:08.725021Z" } }, "outputs": [], @@ -296,10 +296,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:36.554823Z", - "iopub.status.busy": "2024-11-05T05:08:36.554680Z", - "iopub.status.idle": 
"2024-11-05T05:08:38.053945Z", - "shell.execute_reply": "2024-11-05T05:08:38.053034Z" + "iopub.execute_input": "2024-11-07T18:45:08.727865Z", + "iopub.status.busy": "2024-11-07T18:45:08.727721Z", + "iopub.status.idle": "2024-11-07T18:45:11.165841Z", + "shell.execute_reply": "2024-11-07T18:45:11.165282Z" } }, "outputs": [], @@ -335,10 +335,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:38.056783Z", - "iopub.status.busy": "2024-11-05T05:08:38.056497Z", - "iopub.status.idle": "2024-11-05T05:09:04.436030Z", - "shell.execute_reply": "2024-11-05T05:09:04.435311Z" + "iopub.execute_input": "2024-11-07T18:45:11.167853Z", + "iopub.status.busy": "2024-11-07T18:45:11.167711Z", + "iopub.status.idle": "2024-11-07T18:45:39.542988Z", + "shell.execute_reply": "2024-11-07T18:45:39.542135Z" } }, "outputs": [], @@ -360,10 +360,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:04.438987Z", - "iopub.status.busy": "2024-11-05T05:09:04.438568Z", - "iopub.status.idle": "2024-11-05T05:09:04.485291Z", - "shell.execute_reply": "2024-11-05T05:09:04.484829Z" + "iopub.execute_input": "2024-11-07T18:45:39.545416Z", + "iopub.status.busy": "2024-11-07T18:45:39.545005Z", + "iopub.status.idle": "2024-11-07T18:45:39.588793Z", + "shell.execute_reply": "2024-11-07T18:45:39.588054Z" } }, "outputs": [], @@ -392,10 +392,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:04.487191Z", - "iopub.status.busy": "2024-11-05T05:09:04.486929Z", - "iopub.status.idle": "2024-11-05T05:09:25.553481Z", - "shell.execute_reply": "2024-11-05T05:09:25.552747Z" + "iopub.execute_input": "2024-11-07T18:45:39.590729Z", + "iopub.status.busy": "2024-11-07T18:45:39.590446Z", + "iopub.status.idle": "2024-11-07T18:45:59.660376Z", + "shell.execute_reply": "2024-11-07T18:45:59.659992Z" } }, "outputs": [], @@ -419,10 +419,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:25.555813Z", - "iopub.status.busy": "2024-11-05T05:09:25.555666Z", - "iopub.status.idle": "2024-11-05T05:09:26.354372Z", - "shell.execute_reply": "2024-11-05T05:09:26.353693Z" + "iopub.execute_input": "2024-11-07T18:45:59.661779Z", + "iopub.status.busy": "2024-11-07T18:45:59.661641Z", + "iopub.status.idle": "2024-11-07T18:46:00.475726Z", + "shell.execute_reply": "2024-11-07T18:46:00.475269Z" } }, "outputs": [], @@ -445,10 +445,7 @@ "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", "\n", "url = \"http://localhost:30030/classify\"\n", - "data = {\n", - " \"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \n", - " \"text\": prompts\n", - "}\n", + "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", "\n", "responses = requests.post(url, json=data).json()\n", "for response in responses:\n", @@ -460,10 +457,10 @@ "execution_count": 15, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:26.356532Z", - "iopub.status.busy": "2024-11-05T05:09:26.356327Z", - "iopub.status.idle": "2024-11-05T05:09:26.396590Z", - "shell.execute_reply": "2024-11-05T05:09:26.395914Z" + "iopub.execute_input": "2024-11-07T18:46:00.477283Z", + "iopub.status.busy": "2024-11-07T18:46:00.477025Z", + "iopub.status.idle": "2024-11-07T18:46:00.525758Z", + "shell.execute_reply": "2024-11-07T18:46:00.525236Z" } }, "outputs": [], diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 48f2800042..f2c5076409 
100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -35,10 +35,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:21:27.503026Z", - "iopub.status.busy": "2024-11-05T05:21:27.502741Z", - "iopub.status.idle": "2024-11-05T05:21:49.554631Z", - "shell.execute_reply": "2024-11-05T05:21:49.553690Z" + "iopub.execute_input": "2024-11-07T18:46:04.789536Z", + "iopub.status.busy": "2024-11-07T18:46:04.789418Z", + "iopub.status.idle": "2024-11-07T18:46:27.038169Z", + "shell.execute_reply": "2024-11-07T18:46:27.037540Z" } }, "outputs": [], @@ -64,10 +64,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:21:49.558275Z", - "iopub.status.busy": "2024-11-05T05:21:49.558110Z", - "iopub.status.idle": "2024-11-05T05:21:52.717287Z", - "shell.execute_reply": "2024-11-05T05:21:52.716842Z" + "iopub.execute_input": "2024-11-07T18:46:27.040005Z", + "iopub.status.busy": "2024-11-07T18:46:27.039872Z", + "iopub.status.idle": "2024-11-07T18:46:30.203840Z", + "shell.execute_reply": "2024-11-07T18:46:30.203368Z" } }, "outputs": [], @@ -99,10 +99,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:21:52.721738Z", - "iopub.status.busy": "2024-11-05T05:21:52.720908Z", - "iopub.status.idle": "2024-11-05T05:22:01.770341Z", - "shell.execute_reply": "2024-11-05T05:22:01.769510Z" + "iopub.execute_input": "2024-11-07T18:46:30.205880Z", + "iopub.status.busy": "2024-11-07T18:46:30.205719Z", + "iopub.status.idle": "2024-11-07T18:46:39.256561Z", + "shell.execute_reply": "2024-11-07T18:46:39.255880Z" } }, "outputs": [], @@ -137,10 +137,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:01.772662Z", - "iopub.status.busy": "2024-11-05T05:22:01.772377Z", - "iopub.status.idle": "2024-11-05T05:22:04.897499Z", - "shell.execute_reply": "2024-11-05T05:22:04.896867Z" + "iopub.execute_input": "2024-11-07T18:46:39.259464Z", + "iopub.status.busy": "2024-11-07T18:46:39.259309Z", + "iopub.status.idle": "2024-11-07T18:46:42.384955Z", + "shell.execute_reply": "2024-11-07T18:46:42.384378Z" } }, "outputs": [], @@ -179,10 +179,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:04.899754Z", - "iopub.status.busy": "2024-11-05T05:22:04.899478Z", - "iopub.status.idle": "2024-11-05T05:22:13.970245Z", - "shell.execute_reply": "2024-11-05T05:22:13.969779Z" + "iopub.execute_input": "2024-11-07T18:46:42.387431Z", + "iopub.status.busy": "2024-11-07T18:46:42.387279Z", + "iopub.status.idle": "2024-11-07T18:46:51.448572Z", + "shell.execute_reply": "2024-11-07T18:46:51.447781Z" } }, "outputs": [], @@ -216,10 +216,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:13.972039Z", - "iopub.status.busy": "2024-11-05T05:22:13.971846Z", - "iopub.status.idle": "2024-11-05T05:22:14.027421Z", - "shell.execute_reply": "2024-11-05T05:22:14.027003Z" + "iopub.execute_input": "2024-11-07T18:46:51.451177Z", + "iopub.status.busy": "2024-11-07T18:46:51.450952Z", + "iopub.status.idle": "2024-11-07T18:46:51.497530Z", + "shell.execute_reply": "2024-11-07T18:46:51.496850Z" } }, "outputs": [], diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 1dfa531299..776af13f87 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -39,10 +39,10 @@ 
"execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:30.637832Z", - "iopub.status.busy": "2024-11-05T05:09:30.637709Z", - "iopub.status.idle": "2024-11-05T05:09:58.830158Z", - "shell.execute_reply": "2024-11-05T05:09:58.829395Z" + "iopub.execute_input": "2024-11-07T18:46:54.813876Z", + "iopub.status.busy": "2024-11-07T18:46:54.813741Z", + "iopub.status.idle": "2024-11-07T18:47:24.015527Z", + "shell.execute_reply": "2024-11-07T18:47:24.014987Z" } }, "outputs": [], @@ -79,10 +79,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:58.833008Z", - "iopub.status.busy": "2024-11-05T05:09:58.832805Z", - "iopub.status.idle": "2024-11-05T05:10:00.187146Z", - "shell.execute_reply": "2024-11-05T05:10:00.186657Z" + "iopub.execute_input": "2024-11-07T18:47:24.018153Z", + "iopub.status.busy": "2024-11-07T18:47:24.017755Z", + "iopub.status.idle": "2024-11-07T18:47:25.374821Z", + "shell.execute_reply": "2024-11-07T18:47:25.374397Z" } }, "outputs": [], @@ -119,10 +119,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:00.189444Z", - "iopub.status.busy": "2024-11-05T05:10:00.189289Z", - "iopub.status.idle": "2024-11-05T05:10:03.291891Z", - "shell.execute_reply": "2024-11-05T05:10:03.291173Z" + "iopub.execute_input": "2024-11-07T18:47:25.376617Z", + "iopub.status.busy": "2024-11-07T18:47:25.376495Z", + "iopub.status.idle": "2024-11-07T18:47:28.482537Z", + "shell.execute_reply": "2024-11-07T18:47:28.482125Z" } }, "outputs": [], @@ -165,10 +165,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:03.294389Z", - "iopub.status.busy": "2024-11-05T05:10:03.294237Z", - "iopub.status.idle": "2024-11-05T05:10:03.469357Z", - "shell.execute_reply": "2024-11-05T05:10:03.468661Z" + "iopub.execute_input": "2024-11-07T18:47:28.484819Z", + "iopub.status.busy": "2024-11-07T18:47:28.484673Z", + "iopub.status.idle": "2024-11-07T18:47:28.659814Z", + "shell.execute_reply": "2024-11-07T18:47:28.659435Z" } }, "outputs": [], @@ -198,10 +198,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:03.471573Z", - "iopub.status.busy": "2024-11-05T05:10:03.471430Z", - "iopub.status.idle": "2024-11-05T05:10:04.977081Z", - "shell.execute_reply": "2024-11-05T05:10:04.976391Z" + "iopub.execute_input": "2024-11-07T18:47:28.661844Z", + "iopub.status.busy": "2024-11-07T18:47:28.661710Z", + "iopub.status.idle": "2024-11-07T18:47:30.168922Z", + "shell.execute_reply": "2024-11-07T18:47:30.168600Z" } }, "outputs": [], @@ -234,10 +234,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:04.979428Z", - "iopub.status.busy": "2024-11-05T05:10:04.979272Z", - "iopub.status.idle": "2024-11-05T05:10:08.568761Z", - "shell.execute_reply": "2024-11-05T05:10:08.568355Z" + "iopub.execute_input": "2024-11-07T18:47:30.171319Z", + "iopub.status.busy": "2024-11-07T18:47:30.171176Z", + "iopub.status.idle": "2024-11-07T18:47:33.760113Z", + "shell.execute_reply": "2024-11-07T18:47:33.759713Z" } }, "outputs": [], @@ -273,10 +273,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:08.571102Z", - "iopub.status.busy": "2024-11-05T05:10:08.570964Z", - "iopub.status.idle": "2024-11-05T05:10:23.214087Z", - "shell.execute_reply": "2024-11-05T05:10:23.213664Z" + "iopub.execute_input": "2024-11-07T18:47:33.762729Z", + "iopub.status.busy": 
"2024-11-07T18:47:33.762590Z", + "iopub.status.idle": "2024-11-07T18:47:34.255316Z", + "shell.execute_reply": "2024-11-07T18:47:34.254907Z" } }, "outputs": [], @@ -297,7 +297,10 @@ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " messages=[\n", - " {\"role\": \"user\", \"content\": \"Give me the information of the capital of France in the JSON format.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Give me the information of the capital of France in the JSON format.\",\n", + " },\n", " ],\n", " temperature=0,\n", " max_tokens=128,\n", @@ -322,10 +325,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:23.216229Z", - "iopub.status.busy": "2024-11-05T05:10:23.216076Z", - "iopub.status.idle": "2024-11-05T05:10:23.884236Z", - "shell.execute_reply": "2024-11-05T05:10:23.883897Z" + "iopub.execute_input": "2024-11-07T18:47:34.257393Z", + "iopub.status.busy": "2024-11-07T18:47:34.257246Z", + "iopub.status.idle": "2024-11-07T18:47:34.413506Z", + "shell.execute_reply": "2024-11-07T18:47:34.413172Z" } }, "outputs": [], @@ -365,10 +368,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:23.886276Z", - "iopub.status.busy": "2024-11-05T05:10:23.886136Z", - "iopub.status.idle": "2024-11-05T05:10:23.905880Z", - "shell.execute_reply": "2024-11-05T05:10:23.905529Z" + "iopub.execute_input": "2024-11-07T18:47:34.414816Z", + "iopub.status.busy": "2024-11-07T18:47:34.414541Z", + "iopub.status.idle": "2024-11-07T18:47:34.431341Z", + "shell.execute_reply": "2024-11-07T18:47:34.431081Z" } }, "outputs": [], @@ -427,10 +430,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:23.907468Z", - "iopub.status.busy": "2024-11-05T05:10:23.907247Z", - "iopub.status.idle": "2024-11-05T05:10:26.920212Z", - "shell.execute_reply": "2024-11-05T05:10:26.919865Z" + "iopub.execute_input": "2024-11-07T18:47:34.432325Z", + "iopub.status.busy": "2024-11-07T18:47:34.432208Z", + "iopub.status.idle": "2024-11-07T18:47:37.444337Z", + "shell.execute_reply": "2024-11-07T18:47:37.444000Z" } }, "outputs": [], @@ -482,10 +485,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:26.922675Z", - "iopub.status.busy": "2024-11-05T05:10:26.922413Z", - "iopub.status.idle": "2024-11-05T05:10:51.961703Z", - "shell.execute_reply": "2024-11-05T05:10:51.960846Z" + "iopub.execute_input": "2024-11-07T18:47:37.445894Z", + "iopub.status.busy": "2024-11-07T18:47:37.445744Z", + "iopub.status.idle": "2024-11-07T18:48:02.482532Z", + "shell.execute_reply": "2024-11-07T18:48:02.482042Z" } }, "outputs": [], @@ -565,10 +568,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:51.964749Z", - "iopub.status.busy": "2024-11-05T05:10:51.964215Z", - "iopub.status.idle": "2024-11-05T05:11:05.023450Z", - "shell.execute_reply": "2024-11-05T05:11:05.023101Z" + "iopub.execute_input": "2024-11-07T18:48:02.485206Z", + "iopub.status.busy": "2024-11-07T18:48:02.485064Z", + "iopub.status.idle": "2024-11-07T18:48:15.521489Z", + "shell.execute_reply": "2024-11-07T18:48:15.521156Z" } }, "outputs": [], @@ -660,10 +663,10 @@ "execution_count": 13, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:05.024877Z", - "iopub.status.busy": "2024-11-05T05:11:05.024561Z", - "iopub.status.idle": "2024-11-05T05:11:06.358695Z", - "shell.execute_reply": 
"2024-11-05T05:11:06.357635Z" + "iopub.execute_input": "2024-11-07T18:48:15.522794Z", + "iopub.status.busy": "2024-11-07T18:48:15.522657Z", + "iopub.status.idle": "2024-11-07T18:48:16.875740Z", + "shell.execute_reply": "2024-11-07T18:48:16.874847Z" } }, "outputs": [], diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index a221c16eb4..078024f011 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -35,10 +35,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:17.227174Z", - "iopub.status.busy": "2024-11-05T05:22:17.226952Z", - "iopub.status.idle": "2024-11-05T05:22:42.445791Z", - "shell.execute_reply": "2024-11-05T05:22:42.444980Z" + "iopub.execute_input": "2024-11-07T18:48:21.128020Z", + "iopub.status.busy": "2024-11-07T18:48:21.127898Z", + "iopub.status.idle": "2024-11-07T18:48:45.310371Z", + "shell.execute_reply": "2024-11-07T18:48:45.309469Z" } }, "outputs": [], @@ -72,10 +72,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.448147Z", - "iopub.status.busy": "2024-11-05T05:22:42.447775Z", - "iopub.status.idle": "2024-11-05T05:22:42.495311Z", - "shell.execute_reply": "2024-11-05T05:22:42.495027Z" + "iopub.execute_input": "2024-11-07T18:48:45.313506Z", + "iopub.status.busy": "2024-11-07T18:48:45.313123Z", + "iopub.status.idle": "2024-11-07T18:48:45.364918Z", + "shell.execute_reply": "2024-11-07T18:48:45.364155Z" } }, "outputs": [], @@ -106,10 +106,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.496666Z", - "iopub.status.busy": "2024-11-05T05:22:42.496524Z", - "iopub.status.idle": "2024-11-05T05:22:42.540687Z", - "shell.execute_reply": "2024-11-05T05:22:42.540060Z" + "iopub.execute_input": "2024-11-07T18:48:45.367776Z", + "iopub.status.busy": "2024-11-07T18:48:45.367490Z", + "iopub.status.idle": "2024-11-07T18:48:45.411386Z", + "shell.execute_reply": "2024-11-07T18:48:45.411134Z" } }, "outputs": [], @@ -140,10 +140,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.542551Z", - "iopub.status.busy": "2024-11-05T05:22:42.542282Z", - "iopub.status.idle": "2024-11-05T05:22:42.928542Z", - "shell.execute_reply": "2024-11-05T05:22:42.928181Z" + "iopub.execute_input": "2024-11-07T18:48:45.412462Z", + "iopub.status.busy": "2024-11-07T18:48:45.412351Z", + "iopub.status.idle": "2024-11-07T18:48:45.768796Z", + "shell.execute_reply": "2024-11-07T18:48:45.768406Z" } }, "outputs": [], @@ -176,10 +176,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.930093Z", - "iopub.status.busy": "2024-11-05T05:22:42.929954Z", - "iopub.status.idle": "2024-11-05T05:22:44.799945Z", - "shell.execute_reply": "2024-11-05T05:22:44.799562Z" + "iopub.execute_input": "2024-11-07T18:48:45.770227Z", + "iopub.status.busy": "2024-11-07T18:48:45.770106Z", + "iopub.status.idle": "2024-11-07T18:48:47.447065Z", + "shell.execute_reply": "2024-11-07T18:48:47.446733Z" } }, "outputs": [], @@ -208,10 +208,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:44.801418Z", - "iopub.status.busy": "2024-11-05T05:22:44.801192Z", - "iopub.status.idle": "2024-11-05T05:22:45.094634Z", - "shell.execute_reply": "2024-11-05T05:22:45.093950Z" + "iopub.execute_input": "2024-11-07T18:48:47.448510Z", + "iopub.status.busy": 
"2024-11-07T18:48:47.448337Z", + "iopub.status.idle": "2024-11-07T18:48:47.743336Z", + "shell.execute_reply": "2024-11-07T18:48:47.742276Z" } }, "outputs": [], diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index cbbba8c12d..ef0fd40e36 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -39,10 +39,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:49.320999Z", - "iopub.status.busy": "2024-11-05T05:22:49.320880Z", - "iopub.status.idle": "2024-11-05T05:23:21.537478Z", - "shell.execute_reply": "2024-11-05T05:23:21.536956Z" + "iopub.execute_input": "2024-11-07T18:43:47.311708Z", + "iopub.status.busy": "2024-11-07T18:43:47.311517Z", + "iopub.status.idle": "2024-11-07T18:44:18.512576Z", + "shell.execute_reply": "2024-11-07T18:44:18.511909Z" } }, "outputs": [], @@ -78,10 +78,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:21.539953Z", - "iopub.status.busy": "2024-11-05T05:23:21.539100Z", - "iopub.status.idle": "2024-11-05T05:23:25.880179Z", - "shell.execute_reply": "2024-11-05T05:23:25.879744Z" + "iopub.execute_input": "2024-11-07T18:44:18.515678Z", + "iopub.status.busy": "2024-11-07T18:44:18.515314Z", + "iopub.status.idle": "2024-11-07T18:44:22.880793Z", + "shell.execute_reply": "2024-11-07T18:44:22.880303Z" } }, "outputs": [], @@ -129,10 +129,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:25.881742Z", - "iopub.status.busy": "2024-11-05T05:23:25.881595Z", - "iopub.status.idle": "2024-11-05T05:23:26.758503Z", - "shell.execute_reply": "2024-11-05T05:23:26.758084Z" + "iopub.execute_input": "2024-11-07T18:44:22.883309Z", + "iopub.status.busy": "2024-11-07T18:44:22.883160Z", + "iopub.status.idle": "2024-11-07T18:44:27.048810Z", + "shell.execute_reply": "2024-11-07T18:44:27.048074Z" } }, "outputs": [], @@ -176,10 +176,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:26.760098Z", - "iopub.status.busy": "2024-11-05T05:23:26.759955Z", - "iopub.status.idle": "2024-11-05T05:23:27.849510Z", - "shell.execute_reply": "2024-11-05T05:23:27.849117Z" + "iopub.execute_input": "2024-11-07T18:44:27.051312Z", + "iopub.status.busy": "2024-11-07T18:44:27.051190Z", + "iopub.status.idle": "2024-11-07T18:44:32.358097Z", + "shell.execute_reply": "2024-11-07T18:44:32.357628Z" } }, "outputs": [], @@ -227,10 +227,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:27.850994Z", - "iopub.status.busy": "2024-11-05T05:23:27.850864Z", - "iopub.status.idle": "2024-11-05T05:23:31.609137Z", - "shell.execute_reply": "2024-11-05T05:23:31.608748Z" + "iopub.execute_input": "2024-11-07T18:44:32.359532Z", + "iopub.status.busy": "2024-11-07T18:44:32.359413Z", + "iopub.status.idle": "2024-11-07T18:44:36.164664Z", + "shell.execute_reply": "2024-11-07T18:44:36.164005Z" } }, "outputs": [], @@ -276,10 +276,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:31.610683Z", - "iopub.status.busy": "2024-11-05T05:23:31.610560Z", - "iopub.status.idle": "2024-11-05T05:23:32.965146Z", - "shell.execute_reply": "2024-11-05T05:23:32.963922Z" + "iopub.execute_input": "2024-11-07T18:44:36.167123Z", + "iopub.status.busy": "2024-11-07T18:44:36.166535Z", + "iopub.status.idle": "2024-11-07T18:44:37.743761Z", + "shell.execute_reply": "2024-11-07T18:44:37.742510Z" } }, 
"outputs": [], diff --git a/docs/conf.py b/docs/conf.py index e0656bb65c..00153f98ff 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,7 +31,7 @@ ] nbsphinx_allow_errors = True -nbsphinx_execute = 'never' +nbsphinx_execute = "never" autosectionlabel_prefix_document = True nbsphinx_allow_directives = True @@ -49,7 +49,7 @@ myst_heading_anchors = 3 -nbsphinx_kernel_name = 'python3' +nbsphinx_kernel_name = "python3" nbsphinx_execute_arguments = [ "--InlineBackend.figure_formats={'svg', 'pdf'}", "--InlineBackend.rc={'figure.dpi': 96}", @@ -130,8 +130,10 @@ html_static_path = ["_static"] html_css_files = ["css/custom_log.css"] + def setup(app): - app.add_css_file('css/custom_log.css') + app.add_css_file("css/custom_log.css") + myst_enable_extensions = [ "dollarmath", diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index ed1ea61394..684b1c8d83 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -33,10 +33,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:10.680191Z", - "iopub.status.busy": "2024-11-05T05:11:10.679710Z", - "iopub.status.idle": "2024-11-05T05:11:39.882385Z", - "shell.execute_reply": "2024-11-05T05:11:39.881827Z" + "iopub.execute_input": "2024-11-07T18:48:52.032229Z", + "iopub.status.busy": "2024-11-07T18:48:52.032105Z", + "iopub.status.idle": "2024-11-07T18:49:20.226042Z", + "shell.execute_reply": "2024-11-07T18:49:20.225562Z" } }, "outputs": [], @@ -49,7 +49,7 @@ ")\n", "\n", "server_process = execute_shell_command(\n", - "\"\"\"\n", + " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", @@ -70,10 +70,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:39.883923Z", - "iopub.status.busy": "2024-11-05T05:11:39.883721Z", - "iopub.status.idle": "2024-11-05T05:11:40.124980Z", - "shell.execute_reply": "2024-11-05T05:11:40.124557Z" + "iopub.execute_input": "2024-11-07T18:49:20.228006Z", + "iopub.status.busy": "2024-11-07T18:49:20.227572Z", + "iopub.status.idle": "2024-11-07T18:49:20.469885Z", + "shell.execute_reply": "2024-11-07T18:49:20.469518Z" } }, "outputs": [], @@ -101,10 +101,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:40.126564Z", - "iopub.status.busy": "2024-11-05T05:11:40.126369Z", - "iopub.status.idle": "2024-11-05T05:11:40.324316Z", - "shell.execute_reply": "2024-11-05T05:11:40.323693Z" + "iopub.execute_input": "2024-11-07T18:49:20.471956Z", + "iopub.status.busy": "2024-11-07T18:49:20.471811Z", + "iopub.status.idle": "2024-11-07T18:49:20.667997Z", + "shell.execute_reply": "2024-11-07T18:49:20.667630Z" } }, "outputs": [], @@ -115,9 +115,7 @@ "\n", "data = {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", - " \"messages\": [\n", - " {\"role\": \"user\", \"content\": \"What is the capital of France?\"}\n", - " ]\n", + " \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n", "}\n", "\n", "response = requests.post(url, json=data)\n", @@ -136,10 +134,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:40.327043Z", - "iopub.status.busy": "2024-11-05T05:11:40.326759Z", - "iopub.status.idle": "2024-11-05T05:11:41.687336Z", - "shell.execute_reply": "2024-11-05T05:11:41.686855Z" + "iopub.execute_input": "2024-11-07T18:49:20.669977Z", + "iopub.status.busy": 
"2024-11-07T18:49:20.669826Z", + "iopub.status.idle": "2024-11-07T18:49:22.004855Z", + "shell.execute_reply": "2024-11-07T18:49:22.004472Z" } }, "outputs": [], @@ -171,10 +169,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:41.688676Z", - "iopub.status.busy": "2024-11-05T05:11:41.688527Z", - "iopub.status.idle": "2024-11-05T05:11:42.717140Z", - "shell.execute_reply": "2024-11-05T05:11:42.716452Z" + "iopub.execute_input": "2024-11-07T18:49:22.006983Z", + "iopub.status.busy": "2024-11-07T18:49:22.006858Z", + "iopub.status.idle": "2024-11-07T18:49:23.029098Z", + "shell.execute_reply": "2024-11-07T18:49:23.028697Z" } }, "outputs": [], @@ -197,7 +195,7 @@ "# Handle the streaming output\n", "for chunk in response:\n", " if chunk.choices[0].delta.content:\n", - " print(chunk.choices[0].delta.content, end='', flush=True)" + " print(chunk.choices[0].delta.content, end=\"\", flush=True)" ] }, { @@ -214,10 +212,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:42.720467Z", - "iopub.status.busy": "2024-11-05T05:11:42.720182Z", - "iopub.status.idle": "2024-11-05T05:11:43.480765Z", - "shell.execute_reply": "2024-11-05T05:11:43.480143Z" + "iopub.execute_input": "2024-11-07T18:49:23.031712Z", + "iopub.status.busy": "2024-11-07T18:49:23.031571Z", + "iopub.status.idle": "2024-11-07T18:49:23.787752Z", + "shell.execute_reply": "2024-11-07T18:49:23.787368Z" } }, "outputs": [], @@ -250,10 +248,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:43.483575Z", - "iopub.status.busy": "2024-11-05T05:11:43.483295Z", - "iopub.status.idle": "2024-11-05T05:11:44.242950Z", - "shell.execute_reply": "2024-11-05T05:11:44.242248Z" + "iopub.execute_input": "2024-11-07T18:49:23.789840Z", + "iopub.status.busy": "2024-11-07T18:49:23.789702Z", + "iopub.status.idle": "2024-11-07T18:49:24.545631Z", + "shell.execute_reply": "2024-11-07T18:49:24.545241Z" } }, "outputs": [], @@ -290,10 +288,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:44.245660Z", - "iopub.status.busy": "2024-11-05T05:11:44.245373Z", - "iopub.status.idle": "2024-11-05T05:11:45.591682Z", - "shell.execute_reply": "2024-11-05T05:11:45.591184Z" + "iopub.execute_input": "2024-11-07T18:49:24.547641Z", + "iopub.status.busy": "2024-11-07T18:49:24.547497Z", + "iopub.status.idle": "2024-11-07T18:49:25.888864Z", + "shell.execute_reply": "2024-11-07T18:49:25.888114Z" } }, "outputs": [], diff --git a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb index 25b91b7d1d..83576d3d01 100644 --- a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb +++ b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb @@ -71,7 +71,7 @@ "source": [ "import json\n", "import os\n", - "from typing import List\n", + "from typing import List\n", "\n", "import chromadb\n", "\n", @@ -80,7 +80,7 @@ "if not os.path.exists(path_qca):\n", " !wget https://virattt.github.io/datasets/abnb-2023-10k.json -O airbnb-2023-10k-qca.json\n", "\n", - "with open(path_qca, 'r') as f:\n", + "with open(path_qca, \"r\") as f:\n", " question_context_answers = json.load(f)\n", "\n", "chroma_client = chromadb.PersistentClient()\n", @@ -88,7 +88,7 @@ "if collection.count() == 0:\n", " collection.add(\n", " 
documents=[qca[\"context\"] for qca in question_context_answers],\n", - " ids=[str(i) for i in range(len(question_context_answers))]\n", + " ids=[str(i) for i in range(len(question_context_answers))],\n", " )" ], "metadata": { @@ -123,7 +123,7 @@ "\n", "load_dotenv()\n", "\n", - "os.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", "p = Parea(api_key=os.getenv(\"PAREA_API_KEY\"), project_name=\"rag_sglang\")\n", "p.integrate_with_sglang()\n", @@ -150,10 +150,7 @@ "source": [ "@trace\n", "def retrieval(question: str) -> List[str]:\n", - " return collection.query(\n", - " query_texts=[question],\n", - " n_results=1\n", - " )['documents'][0]" + " return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]" ], "metadata": { "collapsed": false @@ -176,7 +173,9 @@ "@function\n", "def generation_sglang(s, question: str, *context: str):\n", " context = \"\\n\".join(context)\n", - " s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n", + " s += user(\n", + " f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n", + " )\n", " s += assistant(gen(\"answer\"))\n", "\n", "\n", @@ -223,7 +222,9 @@ " return generation(question, *contexts)\n", "\n", "\n", - "rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")" + "rag_pipeline(\n", + " \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n", + ")" ] }, { @@ -271,7 +272,10 @@ "execution_count": null, "outputs": [], "source": [ - "from parea.evals.rag import context_query_relevancy_factory, percent_target_supported_by_context_factory\n", + "from parea.evals.rag import (\n", + " context_query_relevancy_factory,\n", + " percent_target_supported_by_context_factory,\n", + ")\n", "\n", "\n", "context_relevancy_eval = context_query_relevancy_factory()\n", @@ -280,10 +284,7 @@ "\n", "@trace(eval_funcs=[context_relevancy_eval, percent_target_supported_by_context])\n", "def retrieval(question: str) -> List[str]:\n", - " return collection.query(\n", - " query_texts=[question],\n", - " n_results=1\n", - " )['documents'][0]" + " return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]" ], "metadata": { "collapsed": false @@ -310,10 +311,13 @@ "answer_context_faithfulness = answer_context_faithfulness_statement_level_factory()\n", "answer_matches_target_llm_grader = answer_matches_target_llm_grader_factory()\n", "\n", + "\n", "@function\n", "def generation_sglang(s, question: str, *context: str):\n", " context = \"\\n\".join(context)\n", - " s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n", + " s += user(\n", + " f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n", + " )\n", " s += assistant(gen(\"answer\", max_tokens=1_000))\n", "\n", "\n", @@ -357,7 +361,9 @@ " return generation(question, *contexts)\n", "\n", "\n", - "rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")" + "rag_pipeline(\n", + " \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n", + ")" ], "metadata": { "collapsed": false @@ -402,6 +408,7 @@ "source": [ "!pip install nest-asyncio\n", "import nest_asyncio\n", + "\n", 
"nest_asyncio.apply()" ], "metadata": { @@ -461,7 +468,7 @@ ], "source": [ "e = p.experiment(\n", - " 'RAG',\n", + " \"RAG\",\n", " data=[\n", " {\n", " \"question\": qca[\"question\"],\n", @@ -469,7 +476,7 @@ " }\n", " for qca in question_context_answers\n", " ],\n", - " func=rag_pipeline\n", + " func=rag_pipeline,\n", ").run()" ], "metadata": { diff --git a/examples/runtime/engine/input_ids.py b/examples/runtime/engine/input_ids.py index fd7eb7e22c..168796a810 100644 --- a/examples/runtime/engine/input_ids.py +++ b/examples/runtime/engine/input_ids.py @@ -7,6 +7,7 @@ MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct" + def main(): # Sample prompts. prompts = [ diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index c37cfefbd7..2ce6d74598 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -39,7 +39,7 @@ def __init__( revision: Optional[str] = None, context_length: Optional[int] = None, model_override_args: Optional[dict] = None, - is_embedding: Optional[bool] = None + is_embedding: Optional[bool] = None, ) -> None: # Parse args self.model_override_args = json.loads(model_override_args) @@ -52,7 +52,9 @@ def __init__( self.hf_text_config = get_hf_text_config(self.hf_config) # Check model type - self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding) + self.is_generation = is_generation_model( + self.hf_config.architectures, is_embedding + ) self.is_multimodal = is_multimodal_model(self.hf_config.architectures) self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures) diff --git a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py index f4f5d2b47b..e45dda2cc4 100644 --- a/python/sglang/srt/layers/quantization/base_config.py +++ b/python/sglang/srt/layers/quantization/base_config.py @@ -122,16 +122,14 @@ def get_scaled_act_names(self) -> List[str]: """ raise NotImplementedError -def method_has_implemented_embedding( - method_class: Type[QuantizeMethodBase]) -> bool: + +def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool: """ Not all quant methods have embedding implemented, so we need to check that it exists for our given method. We check this by making sure the function has been changed from the base implementation. 
""" - base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", - None) + base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None) class_embedding = inspect.getattr_static(method_class, "embedding", None) - return (class_embedding is not None - and class_embedding is not base_embedding) + return class_embedding is not None and class_embedding is not base_embedding diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index c1e758b022..a2d15fc781 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -27,59 +27,67 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): """Unquantized method for embeddings.""" - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): """Create weights for embedding layer.""" - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + weight = Parameter( + torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) set_weight_attrs(weight, extra_weight_attrs) - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: return F.linear(x, layer.weight, bias) - def embedding(self, layer: torch.nn.Module, - input_: torch.Tensor) -> torch.Tensor: + def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: return F.embedding(input_, layer.weight) -def pad_vocab_size(vocab_size: int, - pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: +def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: """Pad the vocab size to the given value.""" return ((vocab_size + pad_to - 1) // pad_to) * pad_to def vocab_range_from_per_partition_vocab_size( - per_partition_vocab_size: int, - rank: int, - offset: int = 0) -> Sequence[int]: + per_partition_vocab_size: int, rank: int, offset: int = 0 +) -> Sequence[int]: index_f = rank * per_partition_vocab_size index_l = index_f + per_partition_vocab_size return index_f + offset, index_l + offset -def vocab_range_from_global_vocab_size(global_vocab_size: int, - rank: int, - world_size: int, - offset: int = 0) -> Sequence[int]: +def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int, offset: int = 0 +) -> Sequence[int]: per_partition_vocab_size = divide(global_vocab_size, world_size) - return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, - rank, - offset=offset) + return vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, offset=offset + ) @dataclass class VocabParallelEmbeddingShardIndices: """Indices for a shard of a vocab parallel embedding.""" + padded_org_vocab_start_index: int padded_org_vocab_end_index: int 
padded_added_vocab_start_index: int @@ -100,13 +108,11 @@ def num_added_elements(self) -> int: @property def num_org_elements_padded(self) -> int: - return (self.padded_org_vocab_end_index - - self.padded_org_vocab_start_index) + return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index @property def num_added_elements_padded(self) -> int: - return (self.padded_added_vocab_end_index - - self.padded_added_vocab_start_index) + return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index @property def num_org_vocab_padding(self) -> int: @@ -122,17 +128,14 @@ def num_elements_padded(self) -> int: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index + assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert self.added_vocab_start_index <= self.padded_added_vocab_start_index assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -142,20 +145,27 @@ def __post_init__(self): @torch.jit.script def get_masked_input_and_mask( - input_: torch.Tensor, org_vocab_start_index: int, - org_vocab_end_index: int, num_org_vocab_padding: int, - added_vocab_start_index: int, - added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + input_: torch.Tensor, + org_vocab_start_index: int, + org_vocab_end_index: int, + num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int, +) -> Tuple[torch.Tensor, torch.Tensor]: # torch.jit.script will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( - input_ < added_vocab_end_index) - added_offset = added_vocab_start_index - ( - org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding - valid_offset = (org_vocab_start_index * - org_vocab_mask) + (added_offset * added_vocab_mask) + input_ < added_vocab_end_index + ) + added_offset = ( + added_vocab_start_index + - (org_vocab_end_index - org_vocab_start_index) + - num_org_vocab_padding + ) + valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + added_offset * added_vocab_mask + ) vocab_mask = org_vocab_mask | added_vocab_mask input_ = vocab_mask * (input_ - valid_offset) return input_, ~vocab_mask @@ -200,15 +210,17 @@ class VocabParallelEmbedding(torch.nn.Module): prefix: full name of the layer in the state dict """ # noqa: E501 - def __init__(self, - num_embeddings: int, - embedding_dim: int, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - enable_tp: bool = True): + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + params_dtype: Optional[torch.dtype] = None, + 
org_num_embeddings: Optional[int] = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + enable_tp: bool = True, + ): super().__init__() self.enable_tp = enable_tp @@ -223,18 +235,22 @@ def __init__(self, self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings num_added_embeddings = num_embeddings - self.org_vocab_size - self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, - self.padding_size) + self.org_vocab_size_padded = pad_vocab_size( + self.org_vocab_size, self.padding_size + ) self.num_embeddings_padded = pad_vocab_size( - self.org_vocab_size_padded + num_added_embeddings, - self.padding_size) + self.org_vocab_size_padded + num_added_embeddings, self.padding_size + ) assert self.org_vocab_size_padded <= self.num_embeddings_padded - self.shard_indices = self._get_indices(self.num_embeddings_padded, - self.org_vocab_size_padded, - self.num_embeddings, - self.org_vocab_size, tp_rank, - self.tp_size) + self.shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + tp_rank, + self.tp_size, + ) self.embedding_dim = embedding_dim linear_method = None @@ -248,11 +264,13 @@ def __init__(self, # layer type like ParallelLMHead, this is not important. is_embedding_layer = type(self.__class__) is VocabParallelEmbedding linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method)) + type(linear_method) + ) if is_embedding_layer and not linear_method_implements_embedding: raise NotImplementedError( f"The class {type(linear_method).__name__} must implement " - "the 'embedding' method, see UnquantizedEmbeddingMethod.") + "the 'embedding' method, see UnquantizedEmbeddingMethod." + ) self.linear_method: QuantizeMethodBase = linear_method @@ -260,53 +278,68 @@ def __init__(self, params_dtype = torch.get_default_dtype() # Divide the weight matrix along the vocaburaly dimension. 
self.num_added_embeddings = self.num_embeddings - self.org_vocab_size - self.num_embeddings_per_partition = divide(self.num_embeddings_padded, - self.tp_size) - assert (self.shard_indices.num_elements_padded == - self.num_embeddings_per_partition) + self.num_embeddings_per_partition = divide( + self.num_embeddings_padded, self.tp_size + ) + assert ( + self.shard_indices.num_elements_padded == self.num_embeddings_per_partition + ) self.num_org_embeddings_per_partition = ( - self.shard_indices.org_vocab_end_index - - self.shard_indices.org_vocab_start_index) + self.shard_indices.org_vocab_end_index + - self.shard_indices.org_vocab_start_index + ) self.num_added_embeddings_per_partition = ( - self.shard_indices.added_vocab_end_index - - self.shard_indices.added_vocab_start_index) - - self.linear_method.create_weights(self, - self.embedding_dim, - [self.num_embeddings_per_partition], - self.embedding_dim, - self.num_embeddings_padded, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + self.shard_indices.added_vocab_end_index + - self.shard_indices.added_vocab_start_index + ) + + self.linear_method.create_weights( + self, + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader, + ) @classmethod - def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, - vocab_size: int, org_vocab_size: int, tp_rank: int, - tp_size: int) -> VocabParallelEmbeddingShardIndices: + def _get_indices( + cls, + vocab_size_padded: int, + org_vocab_size_padded: int, + vocab_size: int, + org_vocab_size: int, + tp_rank: int, + tp_size: int, + ) -> VocabParallelEmbeddingShardIndices: """Get start and end indices for vocab parallel embedding, following the layout outlined in the class docstring, based on the given tp_rank and tp_size.""" num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded padded_org_vocab_start_index, padded_org_vocab_end_index = ( - vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, - tp_size)) + vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) + ) padded_added_vocab_start_index, padded_added_vocab_end_index = ( - vocab_range_from_global_vocab_size(num_added_embeddings_padded, - tp_rank, - tp_size, - offset=org_vocab_size)) + vocab_range_from_global_vocab_size( + num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size + ) + ) # remove padding - org_vocab_start_index = min(padded_org_vocab_start_index, - org_vocab_size) + org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size) org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size) - added_vocab_start_index = min(padded_added_vocab_start_index, - vocab_size) + added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size) added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size) return VocabParallelEmbeddingShardIndices( - padded_org_vocab_start_index, padded_org_vocab_end_index, - padded_added_vocab_start_index, padded_added_vocab_end_index, - org_vocab_start_index, org_vocab_end_index, - added_vocab_start_index, added_vocab_end_index) + padded_org_vocab_start_index, + padded_org_vocab_end_index, + padded_added_vocab_start_index, + padded_added_vocab_end_index, + org_vocab_start_index, + org_vocab_end_index, + added_vocab_start_index, + added_vocab_end_index, + ) def get_sharded_to_full_mapping(self) -> Optional[List[int]]: """Get a mapping that can be used to reindex the 
gathered @@ -326,32 +359,49 @@ def get_sharded_to_full_mapping(self) -> Optional[List[int]]: added_embeddings: List[int] = [] padding: List[int] = [] for tp_rank in range(self.tp_size): - shard_indices = self._get_indices(self.num_embeddings_padded, - self.org_vocab_size_padded, - self.num_embeddings, - self.org_vocab_size, tp_rank, - self.tp_size) + shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + tp_rank, + self.tp_size, + ) range_start = self.num_embeddings_per_partition * tp_rank range_end = self.num_embeddings_per_partition * (tp_rank + 1) base_embeddings.extend( - range(range_start, - range_start + shard_indices.num_org_elements)) + range(range_start, range_start + shard_indices.num_org_elements) + ) padding.extend( - range(range_start + shard_indices.num_org_elements, - range_start + shard_indices.num_org_elements_padded)) + range( + range_start + shard_indices.num_org_elements, + range_start + shard_indices.num_org_elements_padded, + ) + ) added_embeddings.extend( range( range_start + shard_indices.num_org_elements_padded, - range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements)) + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + ) + ) padding.extend( range( - range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements, - range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements_padded)) - assert (range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements_padded == range_end) + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded, + ) + ) + assert ( + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded + == range_end + ) ret = base_embeddings + added_embeddings + padding assert len(ret) == self.num_embeddings_padded return ret @@ -385,10 +435,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # If param packed on the same dim we are sharding on, then # need to adjust offsets of loaded weight by pack_factor. if packed_dim is not None and packed_dim == output_dim: - packed_factor = param.packed_factor if isinstance( - param, BasevLLMParameter) else param.pack_factor - assert loaded_weight.shape[output_dim] == (self.org_vocab_size // - param.packed_factor) + packed_factor = ( + param.packed_factor + if isinstance(param, BasevLLMParameter) + else param.pack_factor + ) + assert loaded_weight.shape[output_dim] == ( + self.org_vocab_size // param.packed_factor + ) start_idx = start_idx // packed_factor shard_size = shard_size // packed_factor else: @@ -396,23 +450,24 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - param[loaded_weight.shape[0]:].data.fill_(0) + param[: loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0] :].data.fill_(0) def forward(self, input_): if self.tp_size > 1: # Build the mask. 
masked_input, input_mask = get_masked_input_and_mask( - input_, self.shard_indices.org_vocab_start_index, + input_, + self.shard_indices.org_vocab_start_index, self.shard_indices.org_vocab_end_index, self.shard_indices.num_org_vocab_padding, self.shard_indices.added_vocab_start_index, - self.shard_indices.added_vocab_end_index) + self.shard_indices.added_vocab_end_index, + ) else: masked_input = input_ # Get the embeddings. - output_parallel = self.linear_method.embedding(self, - masked_input.long()) + output_parallel = self.linear_method.embedding(self, masked_input.long()) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) @@ -426,9 +481,9 @@ def extra_repr(self) -> str: s = f"num_embeddings={self.num_embeddings_per_partition}" s += f", embedding_dim={self.embedding_dim}" s += f", org_vocab_size={self.org_vocab_size}" - s += f', num_embeddings_padded={self.num_embeddings_padded}' + s += f", num_embeddings_padded={self.num_embeddings_padded}" if self.enable_tp: - s += f', tp_size={self.tp_size}' + s += f", tp_size={self.tp_size}" return s @@ -448,27 +503,38 @@ class ParallelLMHead(VocabParallelEmbedding): padding_size: padding size for the vocabulary. """ - def __init__(self, - num_embeddings: int, - embedding_dim: int, - bias: bool = False, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): - super().__init__(num_embeddings, embedding_dim, params_dtype, - org_num_embeddings, padding_size, quant_config, - prefix) + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + bias: bool = False, + params_dtype: Optional[torch.dtype] = None, + org_num_embeddings: Optional[int] = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__( + num_embeddings, + embedding_dim, + params_dtype, + org_num_embeddings, + padding_size, + quant_config, + prefix, + ) self.quant_config = quant_config if bias: self.bias = Parameter( - torch.empty(self.num_embeddings_per_partition, - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) + torch.empty(self.num_embeddings_per_partition, dtype=params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) else: self.register_parameter("bias", None) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 9c5ed14f39..b6555183bd 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -86,8 +86,10 @@ def normalize_batch_and_arguments(self): self.parallel_sample_num = self.sampling_params.get("n", 1) else: # isinstance(self.sampling_params, list): self.parallel_sample_num = self.sampling_params[0].get("n", 1) - assert all(self.parallel_sample_num == sampling_params.get("n", 1) for sampling_params in self.sampling_params), ( - "The parallel_sample_num should be the same for all samples in sample params.") + assert all( + self.parallel_sample_num == sampling_params.get("n", 1) + for sampling_params in self.sampling_params + ), "The parallel_sample_num should be the same for all samples in sample params." 
if self.parallel_sample_num > 1 and self.is_single: self.is_single = False diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 742b91398f..79fe1cf9f8 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -911,8 +911,7 @@ def filter_batch( keep_indices = [ i for i in range(len(self.reqs)) - if not self.reqs[i].finished() - and self.reqs[i] is not being_chunked_req + if not self.reqs[i].finished() and self.reqs[i] is not being_chunked_req ] if keep_indices is None or len(keep_indices) == 0: @@ -1043,6 +1042,7 @@ def mark_reqs_started(self): for req in self.reqs: req.started_time = time.time() + @dataclasses.dataclass class ModelWorkerBatch: # The batch id diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index f7933e0acd..f0d191a29c 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -224,8 +224,8 @@ def __init__( self.forward_ct = 0 self.forward_ct_decode = 0 self.num_generated_tokens = 0 - self.last_stats_tic = time.time() # time of last stats for every iter - self.last_log_tic = time.time() # time of last log for print decode log + self.last_stats_tic = time.time() # time of last stats for every iter + self.last_log_tic = time.time() # time of last log for print decode log self.stream_interval = server_args.stream_interval # Init chunked prefill @@ -566,9 +566,7 @@ def get_next_batch_to_run(self): and not self.last_batch.is_empty() ): if self.being_chunked_req: - self.last_batch.filter_batch( - being_chunked_req=self.being_chunked_req - ) + self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req) self.tree_cache.cache_unfinished_req(self.being_chunked_req) # Inflight request keeps its rid but will get a new req_pool_idx. 
self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx) @@ -628,9 +626,7 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]: has_inflight = self.being_chunked_req is not None if has_inflight: self.being_chunked_req.init_next_round_input() - self.being_chunked_req = adder.add_inflight_req( - self.being_chunked_req - ) + self.being_chunked_req = adder.add_inflight_req(self.being_chunked_req) if self.lora_paths: lora_set = ( @@ -813,7 +809,8 @@ def run_batch(self, batch: ScheduleBatch): embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch) ret = embeddings, model_worker_batch.bid return ret - def get_stats(self,batch: ScheduleBatch): + + def get_stats(self, batch: ScheduleBatch): # TODO: get stats for chunked prefill now = time.time() @@ -829,8 +826,8 @@ def get_stats(self,batch: ScheduleBatch): # set stats from prefill if self.stats is not None: # new_seq=self.stats.new_seq - cache_hit_rate=self.stats.cache_hit_rate - token_usage=self.stats.token_usage + cache_hit_rate = self.stats.cache_hit_rate + token_usage = self.stats.token_usage # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 @@ -851,15 +848,19 @@ def get_stats(self,batch: ScheduleBatch): # _, next_token_ids, _ = result if batch is not None: num_generation_tokens_iter = len(batch.output_ids) - gen_throughput = round(num_generation_tokens_iter / (now - self.last_stats_tic), 2) + gen_throughput = round( + num_generation_tokens_iter / (now - self.last_stats_tic), 2 + ) for i, req in enumerate(batch.reqs): # NOTE: Batch forward mode is extend befor start decode, if batch.forward_mode.is_extend(): - num_prompt_tokens_iter=len(batch.input_ids)+sum(batch.prefix_lens) + num_prompt_tokens_iter = len(batch.input_ids) + sum( + batch.prefix_lens + ) time_to_first_tokens_iter.append(now - req.started_time) else: - time_per_output_tokens_iter.append(now-self.last_stats_tic) + time_per_output_tokens_iter.append(now - self.last_stats_tic) if req.finished(): time_e2e_requests.append(now - req.created_time) @@ -867,9 +868,10 @@ def get_stats(self,batch: ScheduleBatch): num_prompt_tokens_requests.append(len(req.origin_input_ids)) num_generation_tokens_requests.append(len(req.output_ids)) finished_reason_requests.append( - req.finished_reason.to_json() - if req.finished_reason is not None - else None) + req.finished_reason.to_json() + if req.finished_reason is not None + else None + ) return Stats( new_seq=new_seq, @@ -893,7 +895,7 @@ def get_stats(self,batch: ScheduleBatch): max_running_requests=self.max_running_requests, ) - def log_stats(self,stats:Stats): + def log_stats(self, stats: Stats): self.metrics_collector.log_stats(stats) def process_batch_result(self, batch: ScheduleBatch, result): @@ -1003,9 +1005,7 @@ def process_batch_result_decode(self, batch: ScheduleBatch, result): if req.is_retracted: continue - if self.server_args.enable_overlap_schedule and ( - req.finished() - ): + if self.server_args.enable_overlap_schedule and (req.finished()): self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1]) continue @@ -1031,7 +1031,10 @@ def process_batch_result_decode(self, batch: ScheduleBatch, result): self.token_to_kv_pool.free_group_end() self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30) - if self.tp_rank == 0 and self.forward_ct_decode % self.server_args.decode_log_interval == 0: + if ( + self.tp_rank == 0 + and self.forward_ct_decode % self.server_args.decode_log_interval == 0 + ): self.print_decode_stats() def add_logprob_return_values( diff --git 
a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 60cfc1be1e..78f35903f4 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -215,7 +215,7 @@ async def _tokenize_one_request( logprob_start_len, top_logprobs_num, obj.stream, - obj.lora_path + obj.lora_path, ) elif isinstance(obj, EmbeddingReqInput): tokenized_obj = TokenizedEmbeddingReqInput( @@ -290,7 +290,9 @@ async def _handle_batch_request( # Tokenize all requests objs = [obj[i] for i in range(batch_size)] - tokenized_objs = await asyncio.gather(*(self._tokenize_one_request(obj) for obj in objs)) + tokenized_objs = await asyncio.gather( + *(self._tokenize_one_request(obj) for obj in objs) + ) # Cache the common prefix for parallel sampling for i in range(batch_size): @@ -322,7 +324,9 @@ async def _handle_batch_request( rid_to_index = {rid: i for i, rid in enumerate(rids)} task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators} while task_map: - done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED) + done, _ = await asyncio.wait( + task_map.keys(), return_when=asyncio.FIRST_COMPLETED + ) for task in done: gen = task_map.pop(task) @@ -367,7 +371,7 @@ async def get_memory_pool_size(self): if self.server_args.dp_size == 1: res = await self.mem_pool_size return res.size - else: # self.server_args.dp_size > 1 + else: # self.server_args.dp_size > 1 self.mem_pool_size_tmp = [] res = await self.mem_pool_size ret = [r.size for r in res] @@ -399,7 +403,7 @@ async def update_weights( self.server_args.load_format = obj.load_format self.model_path = obj.model_path return result.success, result.message - else: # self.server_args.dp_size > 1 + else: # self.server_args.dp_size > 1 self.model_update_tmp = [] result = await self.model_update_result @@ -470,7 +474,7 @@ async def handle_loop(self): if isinstance(recv_obj, UpdateWeightReqOutput): if self.server_args.dp_size == 1: self.model_update_result.set_result(recv_obj) - else: # self.server_args.dp_size > 1 + else: # self.server_args.dp_size > 1 self.model_update_tmp.append(recv_obj) # set future if the all results are recevied if len(self.model_update_tmp) == self.server_args.dp_size: @@ -479,7 +483,7 @@ async def handle_loop(self): elif isinstance(recv_obj, GetMemPoolSizeReqOutput): if self.server_args.dp_size == 1: self.mem_pool_size.set_result(recv_obj) - else: # self.sever_args.dp_size > 1 + else: # self.sever_args.dp_size > 1 self.mem_pool_size_tmp.append(recv_obj) # set future if the all results are received if len(self.mem_pool_size_tmp) == self.server_args.dp_size: diff --git a/python/sglang/srt/metrics/metrics_collector.py b/python/sglang/srt/metrics/metrics_collector.py index df7d6961da..91a8494142 100644 --- a/python/sglang/srt/metrics/metrics_collector.py +++ b/python/sglang/srt/metrics/metrics_collector.py @@ -130,27 +130,65 @@ def __init__(self, labelnames: List[str], max_model_len): self.counter_prompt_tokens = Counter( name="sglang:prompt_tokens_total", documentation="Number of prefill tokens processed.", - labelnames=labelnames) + labelnames=labelnames, + ) self.counter_generation_tokens = Counter( name="sglang:generation_tokens_total", documentation="Number of generation tokens processed.", - labelnames=labelnames) + labelnames=labelnames, + ) self.histogram_time_to_first_token = Histogram( name="sglang:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", 
labelnames=labelnames, buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0 - ]) + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 15.0, + 20.0, + 25.0, + 30.0, + ], + ) self.histogram_time_per_output_token = Histogram( name="sglang:time_per_output_token_seconds", documentation="Histogram of time per output token in seconds.", labelnames=labelnames, buckets=[ - 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, - 1.0, 2.5 - ]) + 0.005, + 0.01, + 0.015, + 0.02, + 0.025, + 0.03, + 0.04, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + 2.5, + ], + ) # Request Stats # Metadata @@ -245,14 +283,19 @@ def log_stats(self, stats: Stats) -> None: stats.num_generation_tokens_requests, ) - self._log_counter(self.metrics.counter_prompt_tokens, - stats.num_prompt_tokens_iter) - self._log_counter(self.metrics.counter_generation_tokens, - stats.num_generation_tokens_iter) - self._log_histogram(self.metrics.histogram_time_to_first_token, - stats.time_to_first_tokens_iter) - self._log_histogram(self.metrics.histogram_time_per_output_token, - stats.time_per_output_tokens_iter) + self._log_counter( + self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter + ) + self._log_counter( + self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter + ) + self._log_histogram( + self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter + ) + self._log_histogram( + self.metrics.histogram_time_per_output_token, + stats.time_per_output_tokens_iter, + ) # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys) self._log_gauge(self.metrics.num_running_sys, stats.num_running_req) diff --git a/python/sglang/srt/models/gpt2.py b/python/sglang/srt/models/gpt2.py index 3495f24d05..8d988fe8ea 100644 --- a/python/sglang/srt/models/gpt2.py +++ b/python/sglang/srt/models/gpt2.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader -#from sglang.srt.layers.activation import get_act_fn +# from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -47,15 +47,14 @@ def __init__( self, layer_id: int, config: GPT2Config, - cache_config = None, + cache_config=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size total_num_heads = config.num_attention_heads - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size() assert total_num_heads % tensor_model_parallel_world_size == 0 self.num_heads = total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // total_num_heads @@ -76,11 +75,13 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.attn = RadixAttention(self.num_heads, - self.head_dim, - scaling=self.scale, - num_kv_heads=total_num_heads, - layer_id=layer_id) + self.attn = RadixAttention( + self.num_heads, + self.head_dim, + scaling=self.scale, + num_kv_heads=total_num_heads, + layer_id=layer_id, + ) def forward( self, @@ -119,10 +120,14 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.act = 
get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn( + config.activation_function, quant_config, intermediate_size + ) - def forward(self, hidden_states: torch.Tensor,) -> torch.Tensor: + def forward( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) hidden_states = self.act(hidden_states) hidden_states, _ = self.c_proj(hidden_states) @@ -135,27 +140,20 @@ def __init__( self, layer_id: int, config: GPT2Config, - cache_config = None, - + cache_config=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): super().__init__() hidden_size = config.hidden_size - inner_dim = (config.n_inner if config.n_inner is not None else 4 * - hidden_size) + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(layer_id, - config, - cache_config, - quant_config, - prefix=f"{prefix}.attn") + self.attn = GPT2Attention( + layer_id, config, cache_config, quant_config, prefix=f"{prefix}.attn" + ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, - config, - quant_config, - prefix=f"{prefix}.mlp") + self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -179,13 +177,12 @@ def forward( return hidden_states - class GPT2Model(nn.Module): def __init__( self, config: GPT2Config, - cache_config = None, + cache_config=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -229,16 +226,15 @@ class GPT2LMHeadModel(nn.Module): def __init__( self, config: GPT2Config, - cache_config = None, + cache_config=None, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, - cache_config, - quant_config, - prefix="transformer") + self.transformer = GPT2Model( + config, cache_config, quant_config, prefix="transformer" + ) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config) @@ -254,8 +250,6 @@ def forward( input_ids, hidden_states, self.lm_head.weight, forward_batch ) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) for name, loaded_weight in weights: @@ -280,8 +274,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if not name.endswith(".weight"): continue loaded_weight = loaded_weight.t() - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + EntryClass = GPT2LMHeadModel diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 895af0e699..1ed8af0e70 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -419,6 +419,7 @@ def launch_engine( for i in range(len(scheduler_pipe_readers)): scheduler_pipe_readers[i].recv() + def add_prometheus_middleware(app: FastAPI): # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216 from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess @@ -490,6 +491,7 @@ def launch_server( finally: t.join() + def _set_prometheus_env(): # Set prometheus multiprocess directory # sglang uses prometheus multiprocess mode @@ -506,6 +508,7 @@ def _set_prometheus_env(): 
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}") + def _set_envs_and_config(server_args: ServerArgs): # Set global environments os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" @@ -763,8 +766,8 @@ def __init__(self, *args, **kwargs): # runtime server default log level is log # offline engine works in scripts, so we set it to error - if 'log_level' not in kwargs: - kwargs['log_level'] = 'error' + if "log_level" not in kwargs: + kwargs["log_level"] = "error" server_args = ServerArgs(*args, **kwargs) launch_engine(server_args=server_args) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 84d1afbd5f..53a493bdea 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -448,7 +448,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch" + help="The log interval of decode batch", ) # Data parallelism diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 8a486131f0..2c68a22b4d 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -742,7 +742,13 @@ def workload_func(base_url, model): finally: pass - run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size) + run_and_check_memory_leak( + workload_func, + disable_radix_cache, + enable_mixed_chunk, + enable_overlap, + chunked_prefill_size, + ) def run_mulit_request_test( @@ -775,4 +781,10 @@ def run_one(_): with ThreadPoolExecutor(2) as executor: list(executor.map(run_one, list(range(4)))) - run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size) + run_and_check_memory_leak( + workload_func, + disable_radix_cache, + enable_mixed_chunk, + enable_overlap, + chunked_prefill_size, + ) diff --git a/python/sglang/utils.py b/python/sglang/utils.py index 9c1fc67950..e694dc198d 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -349,6 +349,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: def terminate_process(process): from sglang.srt.utils import kill_child_process + kill_child_process(process.pid, include_self=True) diff --git a/rust/test_bindings.py b/rust/test_bindings.py index d81e1451f6..c4ecfe3c60 100644 --- a/rust/test_bindings.py +++ b/rust/test_bindings.py @@ -11,7 +11,7 @@ "http://localhost:30000", "http://localhost:30002", ], - policy="random" + policy="random", ) # Start the router - this will block and run the server diff --git a/scripts/playground/reference_hf.py b/scripts/playground/reference_hf.py index 3f5fe2024a..bf56fc3c98 100644 --- a/scripts/playground/reference_hf.py +++ b/scripts/playground/reference_hf.py @@ -104,15 +104,9 @@ def synthetic_tokens(args): default="TinyLlama/TinyLlama-1.1B-Chat-v0.4", # default="meta-llama/Llama-2-7b-chat-hf", ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=16) + parser.add_argument("--max-new-tokens", type=int, default=16) - parser.add_argument( - "--dtype", - type=str, - default="float16") + parser.add_argument("--dtype", type=str, default="float16") args = parser.parse_args() diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index b4c2cde2de..4e3f051e3d 100644 --- a/test/srt/models/test_generation_models.py +++ 
b/test/srt/models/test_generation_models.py @@ -56,7 +56,7 @@ class ModelCase: ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True), ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True), ModelCase("THUDM/glm-4-9b-chat"), - ModelCase("openai-community/gpt2") + ModelCase("openai-community/gpt2"), ] TORCH_DTYPES = [torch.float16] diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index d6ae76b8ab..070a0633ce 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -3,6 +3,7 @@ python3 -m unittest test_openai_server.TestOpenAIServer.test_completion """ + import json import time import unittest diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 3631780da7..a95026e20d 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -1,6 +1,7 @@ """ python3 -m unittest test_skip_tokenizer_init.TestSkipTokenizerInit.test_parallel_sample """ + import json import unittest diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 38781b0e20..0bf46c7713 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -110,7 +110,6 @@ def test_4_gsm8k(self): def test_5_prompt_input_ids_consistency(self): prompt = "The capital of UK is" - model_path = DEFAULT_MODEL_NAME_FOR_TEST engine = sgl.Engine(model_path=model_path, random_seed=42, log_level="error") sampling_params = {"temperature": 0, "max_new_tokens": 8} @@ -118,7 +117,9 @@ def test_5_prompt_input_ids_consistency(self): tokenizer = get_tokenizer(model_path) token_ids = tokenizer.encode(prompt) - out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)["text"] + out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)[ + "text" + ] engine.shutdown() From 1ae270c5d0873c0bcd02b9078e3a6bd0f12fbc1d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 7 Nov 2024 18:20:41 -0800 Subject: [PATCH 02/12] [Doc] fix docs (#1949) --- docs/{references => frontend}/choices_methods.md | 0 docs/index.rst | 4 ++-- docs/references/hyperparameter_tuning.md | 6 +++--- docs/references/troubleshooting.md | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) rename docs/{references => frontend}/choices_methods.md (100%) diff --git a/docs/references/choices_methods.md b/docs/frontend/choices_methods.md similarity index 100% rename from docs/references/choices_methods.md rename to docs/frontend/choices_methods.md diff --git a/docs/index.rst b/docs/index.rst index 130b298119..e81cdd1498 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,17 +36,17 @@ The core features include: :caption: Frontend Tutorial frontend/frontend.md + frontend/choices_methods.md .. toctree:: :maxdepth: 1 :caption: References + references/supported_models.md references/sampling_params.md references/hyperparameter_tuning.md - references/supported_models.md references/benchmark_and_profiling.md - references/choices_methods.md references/custom_chat_template.md references/contributor_guide.md references/troubleshooting.md diff --git a/docs/references/hyperparameter_tuning.md b/docs/references/hyperparameter_tuning.md index 89faa479be..499b81bc0c 100644 --- a/docs/references/hyperparameter_tuning.md +++ b/docs/references/hyperparameter_tuning.md @@ -26,9 +26,9 @@ Data parallelism is better for throughput. 
When there is enough GPU memory, alwa ### Avoid out-of-memory by Tuning `--chunked-prefill-size`, `--mem-fraction-static`, `--max-running-requests` If you see out of memory (OOM) errors, you can try to tune the following parameters. -If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. -If OOM happens during decoding, try to decrease `--max-running-requests`. -You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. +- If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. +- If OOM happens during decoding, try to decrease `--max-running-requests`. +- You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. ### Try Advanced Options - To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly. diff --git a/docs/references/troubleshooting.md b/docs/references/troubleshooting.md index becb186df7..8442bb2050 100644 --- a/docs/references/troubleshooting.md +++ b/docs/references/troubleshooting.md @@ -4,9 +4,9 @@ This page lists some common errors and tips for fixing them. ## CUDA out of memory If you see out of memory (OOM) errors, you can try to tune the following parameters. -If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. -If OOM happens during decoding, try to decrease `--max-running-requests`. -You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. +- If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. +- If OOM happens during decoding, try to decrease `--max-running-requests`. +- You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. ## CUDA error: an illegal memory access was encountered This error may be due to kernel errors or out-of-memory issues. 
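As a worked example of the tuning advice above (the model path and the specific values are illustrative only, not recommendations; tune them for your hardware), the three flags can be combined in a single launch command:

```bash
# Minimal sketch: smaller prefill chunks, a smaller static KV-cache pool,
# and a cap on concurrently running requests to reduce memory pressure.
# The model path and numeric values below are placeholders.
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --chunked-prefill-size 4096 \
  --mem-fraction-static 0.8 \
  --max-running-requests 128
```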
From 67c424cce310d36b7261992ebce00bd218378769 Mon Sep 17 00:00:00 2001 From: HAI Date: Thu, 7 Nov 2024 18:24:02 -0800 Subject: [PATCH 03/12] [Performance, Triton Kernel Args] extend_attention, optimize kern args to _fwd_kernel (#1941) --- .../srt/layers/attention/triton_ops/extend_attention.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py index 52a72d7fea..8c588bd9ce 100644 --- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py +++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py @@ -25,6 +25,7 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import ( context_attention_fwd, ) +from sglang.srt.utils import is_hip is_cuda_available = torch.cuda.is_available() if is_cuda_available: @@ -311,6 +312,10 @@ def extend_attention_fwd( num_warps = 4 if Lk <= 64 else 8 num_stages = 1 + extra_kargs = {} + if is_hip(): + extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2} + _fwd_kernel[grid]( q_extend, k_extend, @@ -348,6 +353,7 @@ def extend_attention_fwd( Lv=Lv, num_warps=num_warps, num_stages=num_stages, + **extra_kargs, ) From d32fba2a4d4cee32d4ba25bb4f04c765fd7f1b9a Mon Sep 17 00:00:00 2001 From: HAI Date: Thu, 7 Nov 2024 18:24:36 -0800 Subject: [PATCH 04/12] [ENV, ROCm] update environment settings (#1939) --- docker/Dockerfile.rocm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 42b3135955..cbbfd47e65 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -8,7 +8,7 @@ FROM $BASE_IMAGE AS base USER root WORKDIR /sgl-workspace - +ARG BUILD_TYPE=all ARG SGL_REPO="https://github.com/sgl-project/sglang" ENV SGL_DEFAULT="main" ARG SGL_BRANCH=${SGL_DEFAULT} @@ -41,5 +41,7 @@ ENV VLLM_FP8_PADDING=1 ENV VLLM_FP8_ACT_PADDING=1 ENV VLLM_FP8_WEIGHT_PADDING=1 ENV VLLM_FP8_REDUCE_CONV=1 +ENV TORCHINDUCTOR_MAX_AUTOTUNE=1 +ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1 CMD ["/bin/bash"] From 691808d587deff22bfa7f8209a7122564514ea7d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 7 Nov 2024 18:28:29 -0800 Subject: [PATCH 05/12] Add a timeout for execute-notebook.yml (#1951) --- .github/workflows/execute-notebook.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml index 170545d72c..e03edd6ce7 100644 --- a/.github/workflows/execute-notebook.yml +++ b/.github/workflows/execute-notebook.yml @@ -42,6 +42,7 @@ jobs: python -m ipykernel install --user --name python3 --display-name "Python 3" - name: Execute notebooks + timeout-minutes: 30 run: | cd docs make clean From a71a44f20369384c986a99836af25d1b302653af Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 7 Nov 2024 19:20:47 -0800 Subject: [PATCH 06/12] Update setup_github_runner.md (#1952) --- docs/developer/setup_github_runner.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/developer/setup_github_runner.md b/docs/developer/setup_github_runner.md index c6218a0b78..8dad7b8c06 100644 --- a/docs/developer/setup_github_runner.md +++ b/docs/developer/setup_github_runner.md @@ -1,4 +1,4 @@ -# Set Up Self-hosted Runners for GitHub Action +# Set Up Self-Hosted Runners for GitHub Action ## Add a Runner @@ -9,9 +9,9 @@ You can mount a folder for the shared huggingface model weights cache. 
The comma ``` docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04 # Nvidia -docker run --shm-size 64g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash +docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash # AMD -docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 64g -it -v /tmp/huggingface:/hf_home henryx/haisgl:sgl0.3.1.post3_vllm0.6.0_triton3.0.0_rocm6.2.1 /bin/bash +docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home henryx/haisgl:sgl0.3.1.post3_vllm0.6.0_triton3.0.0_rocm6.2.1 /bin/bash ``` ### Step 2: Configure the runner by `config.sh` From 5bc2508b80a438dda141c757af5b443db65defe9 Mon Sep 17 00:00:00 2001 From: Yudi Xue <10211+binarycrayon@users.noreply.github.com> Date: Thu, 7 Nov 2024 22:14:16 -0800 Subject: [PATCH 07/12] Monitoring documentation (#1933) --- docs/references/production_metrics.md | 205 +++ examples/monitoring/docker-compose.yaml | 16 + examples/monitoring/grafana.json | 1720 +++++++++++++++++++++++ examples/monitoring/prometheus.yaml | 10 + 4 files changed, 1951 insertions(+) create mode 100644 docs/references/production_metrics.md create mode 100644 examples/monitoring/docker-compose.yaml create mode 100644 examples/monitoring/grafana.json create mode 100644 examples/monitoring/prometheus.yaml diff --git a/docs/references/production_metrics.md b/docs/references/production_metrics.md new file mode 100644 index 0000000000..36515f3d45 --- /dev/null +++ b/docs/references/production_metrics.md @@ -0,0 +1,205 @@ +# Production Metrics + +sglang exposes the following metrics via Prometheus. The metrics are namespaced by `$name` (the model name). + +An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](../examples/monitoring/grafana.json). + +Here is an example of the metrics: + +``` +# HELP sglang:max_total_num_tokens Maximum total number of tokens +# TYPE sglang:max_total_num_tokens gauge +sglang:max_total_num_tokens{name="google/gemma-2-9b-it"} 161721.0 +# HELP sglang:max_prefill_tokens Maximum prefill tokens +# TYPE sglang:max_prefill_tokens gauge +sglang:max_prefill_tokens{name="google/gemma-2-9b-it"} 16384.0 +# HELP sglang:max_running_requests Maximum running requests +# TYPE sglang:max_running_requests gauge +sglang:max_running_requests{name="google/gemma-2-9b-it"} 4097.0 +# HELP sglang:context_len Context length +# TYPE sglang:context_len gauge +sglang:context_len{name="google/gemma-2-9b-it"} 8192.0 +# HELP sglang:prompt_tokens_total Number of prefill tokens processed. +# TYPE sglang:prompt_tokens_total counter +sglang:prompt_tokens_total{name="google/gemma-2-9b-it"} 506780.0 +# HELP sglang:generation_tokens_total Number of generation tokens processed. +# TYPE sglang:generation_tokens_total counter +sglang:generation_tokens_total{name="google/gemma-2-9b-it"} 424549.0 +# HELP sglang:num_requests_running Number of requests currently running on GPU +# TYPE sglang:num_requests_running gauge +sglang:num_requests_running{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:num_requests_waiting Number of requests waiting to be processed. 
+# TYPE sglang:num_requests_waiting gauge +sglang:num_requests_waiting{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:gen_throughput Gen token throughput (token/s) +# TYPE sglang:gen_throughput gauge +sglang:gen_throughput{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:token_usage Total token usage +# TYPE sglang:token_usage gauge +sglang:token_usage{name="google/gemma-2-9b-it"} 0.01 +# HELP sglang:new_seq Number of new sequences +# TYPE sglang:new_seq gauge +sglang:new_seq{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:new_token Number of new token +# TYPE sglang:new_token gauge +sglang:new_token{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:cached_token Number of cached token +# TYPE sglang:cached_token gauge +sglang:cached_token{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:cache_hit_rate Cache hit rate +# TYPE sglang:cache_hit_rate gauge +sglang:cache_hit_rate{name="google/gemma-2-9b-it"} 10.61 +# HELP sglang:queue_req Number of queued requests +# TYPE sglang:queue_req gauge +sglang:queue_req{name="google/gemma-2-9b-it"} 0.0 +# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds. +# TYPE sglang:time_to_first_token_seconds histogram +sglang:time_to_first_token_seconds_sum{name="google/gemma-2-9b-it"} 656.0780844688416 +sglang:time_to_first_token_seconds_bucket{le="0.001",name="google/gemma-2-9b-it"} 0.0 +sglang:time_to_first_token_seconds_bucket{le="0.005",name="google/gemma-2-9b-it"} 0.0 +sglang:time_to_first_token_seconds_bucket{le="0.01",name="google/gemma-2-9b-it"} 0.0 +sglang:time_to_first_token_seconds_bucket{le="0.02",name="google/gemma-2-9b-it"} 0.0 +sglang:time_to_first_token_seconds_bucket{le="0.04",name="google/gemma-2-9b-it"} 207.0 +sglang:time_to_first_token_seconds_bucket{le="0.06",name="google/gemma-2-9b-it"} 456.0 +sglang:time_to_first_token_seconds_bucket{le="0.08",name="google/gemma-2-9b-it"} 598.0 +sglang:time_to_first_token_seconds_bucket{le="0.1",name="google/gemma-2-9b-it"} 707.0 +sglang:time_to_first_token_seconds_bucket{le="0.25",name="google/gemma-2-9b-it"} 1187.0 +sglang:time_to_first_token_seconds_bucket{le="0.5",name="google/gemma-2-9b-it"} 1350.0 +sglang:time_to_first_token_seconds_bucket{le="0.75",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="2.5",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="5.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="7.5",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="10.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="15.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="20.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="25.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="30.0",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2124.0 +sglang:time_to_first_token_seconds_count{name="google/gemma-2-9b-it"} 2124.0 +# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds. 
+# TYPE sglang:time_per_output_token_seconds histogram +sglang:time_per_output_token_seconds_sum{name="google/gemma-2-9b-it"} 29846.5393948555 +sglang:time_per_output_token_seconds_bucket{le="0.005",name="google/gemma-2-9b-it"} 0.0 +sglang:time_per_output_token_seconds_bucket{le="0.01",name="google/gemma-2-9b-it"} 0.0 +sglang:time_per_output_token_seconds_bucket{le="0.015",name="google/gemma-2-9b-it"} 0.0 +sglang:time_per_output_token_seconds_bucket{le="0.02",name="google/gemma-2-9b-it"} 9602.0 +sglang:time_per_output_token_seconds_bucket{le="0.025",name="google/gemma-2-9b-it"} 30060.0 +sglang:time_per_output_token_seconds_bucket{le="0.03",name="google/gemma-2-9b-it"} 39184.0 +sglang:time_per_output_token_seconds_bucket{le="0.04",name="google/gemma-2-9b-it"} 61387.0 +sglang:time_per_output_token_seconds_bucket{le="0.05",name="google/gemma-2-9b-it"} 78835.0 +sglang:time_per_output_token_seconds_bucket{le="0.075",name="google/gemma-2-9b-it"} 139394.0 +sglang:time_per_output_token_seconds_bucket{le="0.1",name="google/gemma-2-9b-it"} 422029.0 +sglang:time_per_output_token_seconds_bucket{le="0.15",name="google/gemma-2-9b-it"} 422029.0 +sglang:time_per_output_token_seconds_bucket{le="0.2",name="google/gemma-2-9b-it"} 422029.0 +sglang:time_per_output_token_seconds_bucket{le="0.3",name="google/gemma-2-9b-it"} 422424.0 +sglang:time_per_output_token_seconds_bucket{le="0.4",name="google/gemma-2-9b-it"} 422424.0 +sglang:time_per_output_token_seconds_bucket{le="0.5",name="google/gemma-2-9b-it"} 422425.0 +sglang:time_per_output_token_seconds_bucket{le="0.75",name="google/gemma-2-9b-it"} 422425.0 +sglang:time_per_output_token_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 422425.0 +sglang:time_per_output_token_seconds_bucket{le="2.5",name="google/gemma-2-9b-it"} 422425.0 +sglang:time_per_output_token_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 422425.0 +sglang:time_per_output_token_seconds_count{name="google/gemma-2-9b-it"} 422425.0 +# HELP sglang:request_prompt_tokens Number of prefill tokens processed +# TYPE sglang:request_prompt_tokens histogram +sglang:request_prompt_tokens_sum{name="google/gemma-2-9b-it"} 500552.0 +sglang:request_prompt_tokens_bucket{le="1.0",name="google/gemma-2-9b-it"} 0.0 +sglang:request_prompt_tokens_bucket{le="2.0",name="google/gemma-2-9b-it"} 0.0 +sglang:request_prompt_tokens_bucket{le="5.0",name="google/gemma-2-9b-it"} 22.0 +sglang:request_prompt_tokens_bucket{le="10.0",name="google/gemma-2-9b-it"} 191.0 +sglang:request_prompt_tokens_bucket{le="20.0",name="google/gemma-2-9b-it"} 511.0 +sglang:request_prompt_tokens_bucket{le="50.0",name="google/gemma-2-9b-it"} 825.0 +sglang:request_prompt_tokens_bucket{le="100.0",name="google/gemma-2-9b-it"} 997.0 +sglang:request_prompt_tokens_bucket{le="200.0",name="google/gemma-2-9b-it"} 1182.0 +sglang:request_prompt_tokens_bucket{le="500.0",name="google/gemma-2-9b-it"} 1748.0 +sglang:request_prompt_tokens_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2102.0 +sglang:request_prompt_tokens_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_prompt_tokens_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_prompt_tokens_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_prompt_tokens_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_prompt_tokens_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_prompt_tokens_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 
+sglang:request_prompt_tokens_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_prompt_tokens_count{name="google/gemma-2-9b-it"} 2104.0 +# HELP sglang:request_generation_tokens Number of generation tokens processed. +# TYPE sglang:request_generation_tokens histogram +sglang:request_generation_tokens_sum{name="google/gemma-2-9b-it"} 424529.0 +sglang:request_generation_tokens_bucket{le="1.0",name="google/gemma-2-9b-it"} 0.0 +sglang:request_generation_tokens_bucket{le="2.0",name="google/gemma-2-9b-it"} 0.0 +sglang:request_generation_tokens_bucket{le="5.0",name="google/gemma-2-9b-it"} 49.0 +sglang:request_generation_tokens_bucket{le="10.0",name="google/gemma-2-9b-it"} 202.0 +sglang:request_generation_tokens_bucket{le="20.0",name="google/gemma-2-9b-it"} 448.0 +sglang:request_generation_tokens_bucket{le="50.0",name="google/gemma-2-9b-it"} 814.0 +sglang:request_generation_tokens_bucket{le="100.0",name="google/gemma-2-9b-it"} 979.0 +sglang:request_generation_tokens_bucket{le="200.0",name="google/gemma-2-9b-it"} 1266.0 +sglang:request_generation_tokens_bucket{le="500.0",name="google/gemma-2-9b-it"} 1883.0 +sglang:request_generation_tokens_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2095.0 +sglang:request_generation_tokens_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 +sglang:request_generation_tokens_count{name="google/gemma-2-9b-it"} 2104.0 +# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds +# TYPE sglang:e2e_request_latency_seconds histogram +sglang:e2e_request_latency_seconds_sum{name="google/gemma-2-9b-it"} 70517.99934530258 +sglang:e2e_request_latency_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 2.0 +sglang:e2e_request_latency_seconds_bucket{le="2.0",name="google/gemma-2-9b-it"} 21.0 +sglang:e2e_request_latency_seconds_bucket{le="5.0",name="google/gemma-2-9b-it"} 54.0 +sglang:e2e_request_latency_seconds_bucket{le="10.0",name="google/gemma-2-9b-it"} 311.0 +sglang:e2e_request_latency_seconds_bucket{le="20.0",name="google/gemma-2-9b-it"} 733.0 +sglang:e2e_request_latency_seconds_bucket{le="50.0",name="google/gemma-2-9b-it"} 1563.0 +sglang:e2e_request_latency_seconds_bucket{le="100.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="200.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="500.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 
+sglang:e2e_request_latency_seconds_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 +sglang:e2e_request_latency_seconds_count{name="google/gemma-2-9b-it"} 2104.0 +# HELP sglang:waiting_request_latency_seconds Histogram of request waiting time in seconds +# TYPE sglang:waiting_request_latency_seconds histogram +sglang:waiting_request_latency_seconds_sum{name="google/gemma-2-9b-it"} 24885.007263183594 +sglang:waiting_request_latency_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 421.0 +sglang:waiting_request_latency_seconds_bucket{le="2.0",name="google/gemma-2-9b-it"} 563.0 +sglang:waiting_request_latency_seconds_bucket{le="5.0",name="google/gemma-2-9b-it"} 900.0 +sglang:waiting_request_latency_seconds_bucket{le="10.0",name="google/gemma-2-9b-it"} 1270.0 +sglang:waiting_request_latency_seconds_bucket{le="20.0",name="google/gemma-2-9b-it"} 1623.0 +sglang:waiting_request_latency_seconds_bucket{le="50.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="100.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="200.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="500.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 +sglang:waiting_request_latency_seconds_count{name="google/gemma-2-9b-it"} 2104.0 +``` + +## Setup Guide + +To setup a monitoring dashboard, you can use the following docker compose file: [examples/monitoring/docker-compose.yaml](../examples/monitoring/docker-compose.yaml). + +Assume you have sglang server running at `localhost:30000`. + +To start the monitoring dashboard (prometheus + grafana), cd to `examples/monitoring` and run: + +```bash +docker compose -f compose.yaml -p monitoring up +``` + +Then you can access the Grafana dashboard at http://localhost:3000. + +### Grafana Dashboard + +To import the Grafana dashboard, click `+` -> `Import` -> `Upload JSON file` -> `Upload` and select [grafana.json](../examples/monitoring/grafana.json). 
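Before wiring up Prometheus and Grafana, it can help to confirm that the server is actually exporting metrics. A minimal check, assuming the server from the example above is listening on `localhost:30000` and serves the Prometheus exposition at the default `/metrics` route:

```bash
# Fetch the raw metrics and look for one of the gauges documented above.
curl -s http://localhost:30000/metrics | grep "sglang:num_requests_running"
```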
diff --git a/examples/monitoring/docker-compose.yaml b/examples/monitoring/docker-compose.yaml new file mode 100644 index 0000000000..6c18b40469 --- /dev/null +++ b/examples/monitoring/docker-compose.yaml @@ -0,0 +1,16 @@ +services: + prometheus: + image: prom/prometheus:latest + network_mode: host + ports: + - "9090:9090" + volumes: + - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml + + grafana: + image: grafana/grafana:latest + network_mode: host + depends_on: + - prometheus + ports: + - "3000:3000" diff --git a/examples/monitoring/grafana.json b/examples/monitoring/grafana.json new file mode 100644 index 0000000000..e7d436de23 --- /dev/null +++ b/examples/monitoring/grafana.json @@ -0,0 +1,1720 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "max-running-requests from server argument", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_running_requests{name=\"$name\", instance=\"$instance\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Running Requests", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "Supported context length with loaded model", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:context_len{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max 
Context Length", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "max_total_tokens", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_total_num_tokens{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Total Num Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "max_prefill_tokens from server args", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:max_prefill_tokens{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Prefill Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:cached_token{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, 
+ "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Cached Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:cache_hit_rate{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Cache Hit Rate (%)", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(sglang:e2e_request_latency_seconds_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E", + "useBackend": false + } + ], + "title": "E2E Request Latency (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "sglang:gen_throughput{instance=\"$instance\", name=\"$name\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Generation Throughput (Token / S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:num_requests_running{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Num Requests Running", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:num_requests_waiting{instance=\"$instance\", name=\"$name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Number of Requests Waiting", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(sglang:e2e_request_latency_seconds_sum{name=\"$name\"}[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count{name=\"$name\"}[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E", + "useBackend": false + } + ], + "title": "Time Request Decoding (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "Time requests waiting before added to batch", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket{name=\"$name\"}[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "rate(sglang:waiting_request_latency_seconds_sum{name=\"$name\"}[$__rate_interval])\r\n/\r\nrate(sglang:waiting_request_latency_seconds_count{name=\"$name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time Request Waiting (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } 
+ }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(sglang:request_prompt_tokens_sum{instance=\"$instance\", name=\"$name\"}[$__rate_interval])) by (instance, name)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Prompt Tokens", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(sglang:request_generation_tokens_sum{instance=\"$instance\", name=\"$name\"}[$__rate_interval])) by (instance, name)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Generated Tokens", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 13, + "options": { + "calculate": false, + "calculation": { + "yBuckets": { + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + 
"steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(sglang:request_prompt_tokens_bucket{name=\"$name\", instance=\"$instance\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Prompt Tokens", + "type": "heatmap" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ee2vha8w6f5kwf" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 12, + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "" + }, + "yBuckets": { + "mode": "size", + "scale": { + "log": 2, + "type": "log" + }, + "value": "" + } + }, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "Generation Length", + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(sglang:request_generation_tokens_bucket{name=\"$name\", instance=\"$instance\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Generation Tokens", + "type": "heatmap" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "127.0.0.1:30000", + "value": "127.0.0.1:30000" + }, + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "instance", + "multi": false, + "name": "instance", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "google/gemma-2-9b-it", + "value": "google/gemma-2-9b-it" + }, + "definition": "label_values(name)", + "hide": 1, + "includeAll": false, + "label": "name", + "multi": false, + "name": "name", + "options": [], + "query": { + 
"qryType": 1, + "query": "label_values(name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "SGLang Dashboard", + "uid": "ddyp55uq7brpcc", + "version": 3, + "weekStart": "" +} diff --git a/examples/monitoring/prometheus.yaml b/examples/monitoring/prometheus.yaml new file mode 100644 index 0000000000..ba16ac3bd3 --- /dev/null +++ b/examples/monitoring/prometheus.yaml @@ -0,0 +1,10 @@ +# prometheus.yaml +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: sglang + static_configs: + - targets: + - '127.0.0.1:30000' From f16eb15d0d4f6fbd48c2c8e1730c3ab14f9ecaa6 Mon Sep 17 00:00:00 2001 From: aqweteddy Date: Fri, 8 Nov 2024 14:42:27 +0800 Subject: [PATCH 08/12] Gemma2 reward model support (#1954) --- docs/references/supported_models.md | 3 +- python/sglang/srt/models/gemma2_reward.py | 103 ++++++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 python/sglang/srt/models/gemma2_reward.py diff --git a/docs/references/supported_models.md b/docs/references/supported_models.md index bfe1bc5528..ce178280b1 100644 --- a/docs/references/supported_models.md +++ b/docs/references/supported_models.md @@ -40,7 +40,8 @@ - LlamaForSequenceClassification - `python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --is-embedding` - +- Gemma2ForSequenceClassification + - `python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Gemma-2-27B-v0.2 --is-embedding` ## How to Support a New Model diff --git a/python/sglang/srt/models/gemma2_reward.py b/python/sglang/srt/models/gemma2_reward.py new file mode 100644 index 0000000000..5faadf67ff --- /dev/null +++ b/python/sglang/srt/models/gemma2_reward.py @@ -0,0 +1,103 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from typing import Iterable, Optional, Tuple + +import torch +from torch import nn +from transformers import Gemma2Config +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType +from sglang.srt.layers.quantization.base_config import QuantizationConfig +from sglang.srt.model_executor.forward_batch_info import ForwardBatch +from sglang.srt.models.gemma2 import Gemma2ForCausalLM, Gemma2Model + + +class Gemma2ForSequenceClassification(nn.Module): + def __init__( + self, + config: Gemma2Config, + quant_config: Optional[QuantizationConfig] = None, + cache_config=None, + ) -> None: + super().__init__() + self.config = config + self.torchao_config = None + self.quant_config = quant_config + self.num_labels = config.num_labels + self.model = Gemma2Model(config, quant_config=quant_config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False) + + self.eos_token_id = config.eos_token_id + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: ForwardBatch, + input_embeds: torch.Tensor = None, + get_embedding: bool = True, + ) -> EmbeddingPoolerOutput: + assert ( + get_embedding + ), "Gemma2ForSequenceClassification is only used for embedding" + + hidden_states = self.model(input_ids, positions, forward_batch, input_embeds) + scores = self.score(hidden_states) + + return self.pooler(scores, forward_batch) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for param_name, shard_name, shard_id in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # lm_head is not used in vllm as it is tied with embed_token. + # To prevent errors, skip loading lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + Gemma2ForCausalLM.load_weights(self, weights) + + +EntryClass = [Gemma2ForSequenceClassification] From 8dc84da08479aabcde2480e8a9c67c249595eb62 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 7 Nov 2024 23:15:08 -0800 Subject: [PATCH 09/12] Remove the useless to_srt_kwargs (#1955) --- python/sglang/srt/sampling/sampling_params.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/python/sglang/srt/sampling/sampling_params.py b/python/sglang/srt/sampling/sampling_params.py index fbe90ba0f7..a0cb8c74cd 100644 --- a/python/sglang/srt/sampling/sampling_params.py +++ b/python/sglang/srt/sampling/sampling_params.py @@ -133,17 +133,3 @@ def normalize(self, tokenizer): else: stop_str_max_len = max(stop_str_max_len, len(stop_str)) self.stop_str_max_len = stop_str_max_len - - def to_srt_kwargs(self): - return { - "max_new_tokens": self.max_new_tokens, - "stop": self.stop_strs, - "stop_token_ids": list(self.stop_token_ids), - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - "frequency_penalty": self.frequency_penalty, - "presence_penalty": self.presence_penalty, - "ignore_eos": self.ignore_eos, - "regex": self.regex, - } From 4ade15dd32397c0a45bd41202b9f949dd78cafe3 Mon Sep 17 00:00:00 2001 From: aqweteddy Date: Fri, 8 Nov 2024 16:10:54 +0800 Subject: [PATCH 10/12] Adjust reward model's score module and pooler module order for reducing computation (#1956) --- python/sglang/srt/models/gemma2_reward.py | 39 ++--------------------- python/sglang/srt/models/llama_reward.py | 29 +++-------------- 2 files changed, 8 insertions(+), 60 deletions(-) diff --git a/python/sglang/srt/models/gemma2_reward.py b/python/sglang/srt/models/gemma2_reward.py index 5faadf67ff..9aab3ce18e 100644 --- a/python/sglang/srt/models/gemma2_reward.py +++ b/python/sglang/srt/models/gemma2_reward.py @@ -58,43 +58,10 @@ def forward( ), "Gemma2ForSequenceClassification is only used for embedding" hidden_states = self.model(input_ids, positions, forward_batch, input_embeds) - scores = self.score(hidden_states) + last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings + scores = self.score(last_token_hidden) - return self.pooler(scores, forward_batch) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - for param_name, shard_name, shard_id in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # lm_head is not used in vllm as it is tied with embed_token. - # To prevent errors, skip loading lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) + return EmbeddingPoolerOutput(scores) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): Gemma2ForCausalLM.load_weights(self, weights) diff --git a/python/sglang/srt/models/llama_reward.py b/python/sglang/srt/models/llama_reward.py index 5b68d1d321..e285ad6921 100644 --- a/python/sglang/srt/models/llama_reward.py +++ b/python/sglang/srt/models/llama_reward.py @@ -59,22 +59,13 @@ def forward( ), "LlamaForSequenceClassification is only used for embedding" hidden_states = self.model(input_ids, positions, forward_batch, input_embeds) - scores = self.score(hidden_states) + last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings + scores = self.score(last_token_hidden) - return self.pooler(scores, forward_batch) + return EmbeddingPoolerOutput(scores) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - params_dict = dict(self.named_parameters()) - - for name, loaded_weight in weights: - if "classification_head" in name: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - elif "lm_head" in name: - continue - else: - LlamaForCausalLM.load_weights(self, [(name, loaded_weight)]) + return LlamaForCausalLM.load_weights(self, weights) class LlamaForSequenceClassificationWithNormal_Weights(LlamaForSequenceClassification): @@ -127,17 +118,7 @@ def forward( return EmbeddingPoolerOutput(scores) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - params_dict = dict(self.named_parameters()) - - for name, loaded_weight in weights: - if "classification_head" in name: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - elif "lm_head" in name: - continue - else: - LlamaForCausalLM.load_weights(self, [(name, loaded_weight)]) + return super().load_weights(weights) EntryClass = [ From f9a377f6501b92896263a8210b45bfcaabe89f2a Mon Sep 17 00:00:00 2001 From: HAI Date: Fri, 8 Nov 2024 00:14:15 -0800 Subject: [PATCH 11/12] [Release, ROCm] release ROCm docker build for AMD MI GPUs (#1957) --- .github/workflows/release-docker-amd.yml | 55 ++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/release-docker-amd.yml diff --git a/.github/workflows/release-docker-amd.yml b/.github/workflows/release-docker-amd.yml new file mode 100644 index 0000000000..866cc5fa52 --- /dev/null +++ b/.github/workflows/release-docker-amd.yml @@ -0,0 +1,55 @@ +name: Release Docker Images (AMD) +on: + push: + branches: + - main + paths: + - "python/sglang/version.py" + workflow_dispatch: + +jobs: + publish: + if: github.repository == 'sgl-project/sglang' + runs-on: docker-builder-amd + environment: 'prod' + strategy: + matrix: + rocm_version: ['6.2.0'] + build_type: ['all', 'srt'] + steps: + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache + + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and Push + run: | + version=$(cat python/sglang/version.py | cut -d'"' -f2) + + if [ "${{ matrix.rocm_version }}" = "6.2.0" ]; then + rocm_tag="rocm620" + 
else + echo "Unsupported ROCm version" + exit 1 + fi + + tag=v${version}-${rocm_tag} + + if [ "${{ matrix.build_type }}" = "all" ]; then + tag_suffix="" + elif [ "${{ matrix.build_type }}" = "srt" ]; then + tag_suffix="-srt" + else + echo "Unsupported build type" + exit 1 + fi + + docker build . -f docker/Dockerfile.rocm --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache + docker push lmsysorg/sglang:${tag}${tag_suffix} From 7ef0084b0d2e3b91fe1fa7cd5e396d47aa613797 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 8 Nov 2024 01:21:29 -0800 Subject: [PATCH 12/12] Add sentence_transformers to CI dependency (#1958) --- scripts/ci_install_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index a13cfd214e..a219e02e21 100644 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -4,5 +4,5 @@ Install the dependency in CI. pip install --upgrade pip pip install -e "python[all]" -pip install transformers==4.45.2 +pip install transformers==4.45.2 sentence_transformers pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall