Skip to content

Commit

Permalink
add load test notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
nielsrolf committed Feb 11, 2025
1 parent cf6609a commit 41c7543
Showing 1 changed file with 212 additions and 0 deletions.
212 changes: 212 additions & 0 deletions example/load_test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"from openweights import OpenWeights # type: ignore\n",
"from dotenv import load_dotenv # type: ignore\n",
"import random\n",
"load_dotenv()\n",
"ow = OpenWeights()\n",
"\n",
"\n",
"from dataclasses import dataclass, asdict\n",
"import time\n",
"import json\n",
"import plotly.express as px\n",
"import plotly.figure_factory as ff\n",
"import pandas as pd\n",
"from typing import List, Dict\n",
"import os\n",
"\n",
"@dataclass \n",
"class RequestResult:\n",
" \"\"\"Results for a single request\"\"\"\n",
" completion_time: float\n",
" total_tokens: int\n",
" prompt_tokens: int\n",
" completion_tokens: int\n",
"\n",
"@dataclass\n",
"class LoadTestResult:\n",
" \"\"\"Class to save all API and client args together with load test results\"\"\"\n",
" name: str\n",
" model: str\n",
" request_timeout: float \n",
" per_token_timeout: float\n",
" max_num_seqs: int\n",
" dataset_size: int\n",
" n_gpus: int\n",
" total_time: float\n",
" total_tokens: int\n",
" total_requests: int\n",
" throughput_requests: float # requests per second\n",
" throughput_tokens: float # tokens per second\n",
" avg_latency: float # seconds per request\n",
" per_request_results: List[Dict] # List of RequestResult as dicts\n",
" \n",
" def to_json(self, filename):\n",
" os.makedirs(os.path.dirname(filename), exist_ok=True)\n",
" with open(filename, 'w') as f:\n",
" json.dump(asdict(self), f, indent=2)\n",
" \n",
" @classmethod\n",
" def from_json(cls, filename):\n",
" with open(filename) as f:\n",
" data = json.load(f)\n",
" return cls(**data)\n",
"\n",
"def plot_results(results):\n",
" \"\"\"Create interactive plots of results\"\"\"\n",
" # Latency vs Throughput scatter plot\n",
" df = pd.DataFrame([asdict(r) for r in results])\n",
" fig1 = px.scatter(\n",
" df,\n",
" x=\"throughput_requests\",\n",
" y=\"avg_latency\",\n",
" hover_data=[\"name\", \"model\", \"max_num_seqs\", \"n_gpus\"],\n",
" title=\"Load Test Results - Latency vs Throughput\"\n",
" )\n",
" fig1.show()\n",
" \n",
" # Distribution of completion times\n",
" for result in results:\n",
" df_requests = pd.DataFrame(result.per_request_results)\n",
" fig2 = ff.create_distplot(\n",
" [df_requests['completion_time']], \n",
" [result.name],\n",
" bin_size=0.1\n",
" )\n",
" fig2.update_layout(\n",
" title=f\"Distribution of Completion Times - {result.name}\",\n",
" xaxis_title=\"Completion Time (s)\",\n",
" yaxis_title=\"Density\"\n",
" )\n",
" fig2.show()\n",
"\n",
"def get_dataset(size):\n",
" \"\"\"Generate test dataset\"\"\"\n",
" return [\n",
" [{\"role\": \"user\", \"content\": \"Please explain in great detail the history of China. Start with a general history, then add chaopter that explain in detail the history of every major city, and then add chapters that explain the history of every major dynasty. Be very detailed and resemble the style of wikipedia.\"}]\n",
" for _ in range(size)\n",
" ]\n",
"\n",
"async def load_test(\n",
" name: str,\n",
" model = \"unsloth/Qwen2.5-32B-Instruct\",\n",
" request_timeout: float = 5,\n",
" per_token_timeout: float = 1,\n",
" max_num_seqs: int = 100,\n",
" dataset_size: int = 1000,\n",
" n_gpus: int = 1,\n",
" n_tokens=[600]\n",
"):\n",
" \"\"\"Deploy a model with the given vllm/client args and test the performance\"\"\"\n",
" deploy_kwargs = dict(\n",
" max_num_seqs=max_num_seqs,\n",
" requires_vram_gb = n_gpus * 65,\n",
" )\n",
" dataset = get_dataset(dataset_size)\n",
" sem = asyncio.Semaphore(max_num_seqs)\n",
" \n",
" start_time = time.time()\n",
" async with ow.deploy(model, **deploy_kwargs):\n",
" ow.chat.request_timeout = request_timeout\n",
" ow.chat.per_token_timeout = per_token_timeout\n",
"\n",
" async def timed_request(messages):\n",
" async with sem:\n",
" req_start = time.time()\n",
" n_tokens_choice = random.choice(n_tokens)\n",
" response = await ow.async_chat.completions.create(model=model, messages=messages, max_tokens=n_tokens_choice)\n",
" print(f\"Completion time: {time.time() - req_start:.2f}s\")\n",
" return RequestResult(\n",
" completion_time=time.time() - req_start,\n",
" total_tokens=response.usage.total_tokens,\n",
" prompt_tokens=response.usage.prompt_tokens,\n",
" completion_tokens=response.usage.completion_tokens\n",
" )\n",
"\n",
" responses = await asyncio.gather(\n",
" *[timed_request(messages) for messages in dataset]\n",
" )\n",
" total_time = time.time() - start_time\n",
" \n",
" total_tokens = sum(r.total_tokens for r in responses)\n",
" total_requests = len(responses)\n",
" \n",
" result = LoadTestResult(\n",
" name=name,\n",
" model=model,\n",
" request_timeout=request_timeout,\n",
" per_token_timeout=per_token_timeout,\n",
" max_num_seqs=max_num_seqs,\n",
" dataset_size=dataset_size,\n",
" n_gpus=n_gpus,\n",
" total_time=total_time,\n",
" total_tokens=total_tokens,\n",
" total_requests=total_requests,\n",
" throughput_requests=total_requests/total_time,\n",
" throughput_tokens=total_tokens/total_time,\n",
" avg_latency=total_time/total_requests,\n",
" per_request_results=[asdict(r) for r in responses]\n",
" )\n",
" \n",
" result.to_json(f\"results/{name}.json\")\n",
" return result\n",
"\n",
"\n",
"async def eval_max_num_seqs(**_load_test_kwargs):\n",
" \"\"\"Evaluate the impact of max_num_seqs on throughput\"\"\"\n",
" results = []\n",
" for max_num_seqs in [1, 10, 100]:\n",
" name = f\"max_num_seqs_{max_num_seqs}\"\n",
" results.append(await load_test(name, max_num_seqs=max_num_seqs, **_load_test_kwargs))\n",
" plot_results(results)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"await eval_max_num_seqs(n_tokens=[600], request_timeout=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"await eval_max_num_seqs(max_tokens=[1000])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kva",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 41c7543

Please sign in to comment.