{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import time\n",
    "from dataclasses import dataclass, asdict\n",
    "from typing import List, Dict\n",
    "\n",
    "import pandas as pd\n",
    "import plotly.express as px\n",
    "import plotly.figure_factory as ff\n",
    "from dotenv import load_dotenv  # type: ignore\n",
    "from openweights import OpenWeights  # type: ignore\n",
    "\n",
    "load_dotenv()\n",
    "ow = OpenWeights()\n",
    "\n",
"@dataclass \n", | ||
"class RequestResult:\n", | ||
" \"\"\"Results for a single request\"\"\"\n", | ||
" completion_time: float\n", | ||
" total_tokens: int\n", | ||
" prompt_tokens: int\n", | ||
" completion_tokens: int\n", | ||
"\n", | ||
"@dataclass\n", | ||
"class LoadTestResult:\n", | ||
" \"\"\"Class to save all API and client args together with load test results\"\"\"\n", | ||
" name: str\n", | ||
" model: str\n", | ||
" request_timeout: float \n", | ||
" per_token_timeout: float\n", | ||
" max_num_seqs: int\n", | ||
" dataset_size: int\n", | ||
" n_gpus: int\n", | ||
" total_time: float\n", | ||
" total_tokens: int\n", | ||
" total_requests: int\n", | ||
" throughput_requests: float # requests per second\n", | ||
" throughput_tokens: float # tokens per second\n", | ||
" avg_latency: float # seconds per request\n", | ||
" per_request_results: List[Dict] # List of RequestResult as dicts\n", | ||
" \n", | ||
" def to_json(self, filename):\n", | ||
" os.makedirs(os.path.dirname(filename), exist_ok=True)\n", | ||
" with open(filename, 'w') as f:\n", | ||
" json.dump(asdict(self), f, indent=2)\n", | ||
" \n", | ||
" @classmethod\n", | ||
" def from_json(cls, filename):\n", | ||
" with open(filename) as f:\n", | ||
" data = json.load(f)\n", | ||
" return cls(**data)\n", | ||
"\n", | ||
"def plot_results(results):\n", | ||
" \"\"\"Create interactive plots of results\"\"\"\n", | ||
" # Latency vs Throughput scatter plot\n", | ||
" df = pd.DataFrame([asdict(r) for r in results])\n", | ||
" fig1 = px.scatter(\n", | ||
" df,\n", | ||
" x=\"throughput_requests\",\n", | ||
" y=\"avg_latency\",\n", | ||
" hover_data=[\"name\", \"model\", \"max_num_seqs\", \"n_gpus\"],\n", | ||
" title=\"Load Test Results - Latency vs Throughput\"\n", | ||
" )\n", | ||
" fig1.show()\n", | ||
" \n", | ||
" # Distribution of completion times\n", | ||
" for result in results:\n", | ||
" df_requests = pd.DataFrame(result.per_request_results)\n", | ||
" fig2 = ff.create_distplot(\n", | ||
" [df_requests['completion_time']], \n", | ||
" [result.name],\n", | ||
" bin_size=0.1\n", | ||
" )\n", | ||
" fig2.update_layout(\n", | ||
" title=f\"Distribution of Completion Times - {result.name}\",\n", | ||
" xaxis_title=\"Completion Time (s)\",\n", | ||
" yaxis_title=\"Density\"\n", | ||
" )\n", | ||
" fig2.show()\n", | ||
"\n", | ||
"def get_dataset(size):\n", | ||
" \"\"\"Generate test dataset\"\"\"\n", | ||
" return [\n", | ||
" [{\"role\": \"user\", \"content\": \"Please explain in great detail the history of China. Start with a general history, then add chaopter that explain in detail the history of every major city, and then add chapters that explain the history of every major dynasty. Be very detailed and resemble the style of wikipedia.\"}]\n", | ||
" for _ in range(size)\n", | ||
" ]\n", | ||
"\n", | ||
"async def load_test(\n", | ||
" name: str,\n", | ||
" model = \"unsloth/Qwen2.5-32B-Instruct\",\n", | ||
" request_timeout: float = 5,\n", | ||
" per_token_timeout: float = 1,\n", | ||
" max_num_seqs: int = 100,\n", | ||
" dataset_size: int = 1000,\n", | ||
" n_gpus: int = 1,\n", | ||
" n_tokens=[600]\n", | ||
"):\n", | ||
" \"\"\"Deploy a model with the given vllm/client args and test the performance\"\"\"\n", | ||
" deploy_kwargs = dict(\n", | ||
" max_num_seqs=max_num_seqs,\n", | ||
" requires_vram_gb = n_gpus * 65,\n", | ||
" )\n", | ||
" dataset = get_dataset(dataset_size)\n", | ||
" sem = asyncio.Semaphore(max_num_seqs)\n", | ||
" \n", | ||
" start_time = time.time()\n", | ||
" async with ow.deploy(model, **deploy_kwargs):\n", | ||
" ow.chat.request_timeout = request_timeout\n", | ||
" ow.chat.per_token_timeout = per_token_timeout\n", | ||
"\n", | ||
" async def timed_request(messages):\n", | ||
" async with sem:\n", | ||
" req_start = time.time()\n", | ||
" n_tokens_choice = random.choice(n_tokens)\n", | ||
" response = await ow.async_chat.completions.create(model=model, messages=messages, max_tokens=n_tokens_choice)\n", | ||
" print(f\"Completion time: {time.time() - req_start:.2f}s\")\n", | ||
" return RequestResult(\n", | ||
" completion_time=time.time() - req_start,\n", | ||
" total_tokens=response.usage.total_tokens,\n", | ||
" prompt_tokens=response.usage.prompt_tokens,\n", | ||
" completion_tokens=response.usage.completion_tokens\n", | ||
" )\n", | ||
"\n", | ||
" responses = await asyncio.gather(\n", | ||
" *[timed_request(messages) for messages in dataset]\n", | ||
" )\n", | ||
" total_time = time.time() - start_time\n", | ||
" \n", | ||
" total_tokens = sum(r.total_tokens for r in responses)\n", | ||
" total_requests = len(responses)\n", | ||
" \n", | ||
" result = LoadTestResult(\n", | ||
" name=name,\n", | ||
" model=model,\n", | ||
" request_timeout=request_timeout,\n", | ||
" per_token_timeout=per_token_timeout,\n", | ||
" max_num_seqs=max_num_seqs,\n", | ||
" dataset_size=dataset_size,\n", | ||
" n_gpus=n_gpus,\n", | ||
" total_time=total_time,\n", | ||
" total_tokens=total_tokens,\n", | ||
" total_requests=total_requests,\n", | ||
" throughput_requests=total_requests/total_time,\n", | ||
" throughput_tokens=total_tokens/total_time,\n", | ||
" avg_latency=total_time/total_requests,\n", | ||
" per_request_results=[asdict(r) for r in responses]\n", | ||
" )\n", | ||
" \n", | ||
" result.to_json(f\"results/{name}.json\")\n", | ||
" return result\n", | ||
"\n", | ||
"\n", | ||
"async def eval_max_num_seqs(**_load_test_kwargs):\n", | ||
" \"\"\"Evaluate the impact of max_num_seqs on throughput\"\"\"\n", | ||
" results = []\n", | ||
" for max_num_seqs in [1, 10, 100]:\n", | ||
" name = f\"max_num_seqs_{max_num_seqs}\"\n", | ||
" results.append(await load_test(name, max_num_seqs=max_num_seqs, **_load_test_kwargs))\n", | ||
" plot_results(results)\n" | ||
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await eval_max_num_seqs(n_tokens=[600], request_timeout=600)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"await eval_max_num_seqs(max_tokens=[1000])" | ||
   ]
  },
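  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of reloading saved runs for later comparison, assuming each run was written by `load_test` under the `results/{name}.json` convention above; the `results/` directory name and the `.json` filter are the only assumptions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: reload every saved LoadTestResult from results/ and re-plot,\n",
    "# assuming the files were produced by load_test above\n",
    "saved = [\n",
    "    LoadTestResult.from_json(os.path.join(\"results\", f))\n",
    "    for f in sorted(os.listdir(\"results\"))\n",
    "    if f.endswith(\".json\")\n",
    "]\n",
    "plot_results(saved)"
   ]
  }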
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kva",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |