From ae156ca736c26a149bcb1fefe3ef6fcf91b9728d Mon Sep 17 00:00:00 2001
From: Lyu Han
Date: Wed, 21 Feb 2024 19:25:31 +0800
Subject: [PATCH] minor fix benchmark generation guide and script (#1175)

* minor fix benchmark generation guide and script

* update
---
 benchmark/benchmark_pytorch_engine_a100.sh   | 12 ++++--
 benchmark/benchmark_turbomind_engine_a100.sh | 45 +++++++++++---------
 benchmark/profile_throughput.py              |  8 ++--
 docs/en/benchmark/profile_generation.md      |  4 +-
 4 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/benchmark/benchmark_pytorch_engine_a100.sh b/benchmark/benchmark_pytorch_engine_a100.sh
index ac41ca06f..220eda241 100755
--- a/benchmark/benchmark_pytorch_engine_a100.sh
+++ b/benchmark/benchmark_pytorch_engine_a100.sh
@@ -5,25 +5,29 @@ dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
## 7B
tp=1
max_batch_size=256
+cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
-CUDA_VISIBLE_DEVICES="4" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv

## 13B
tp=1
max_batch_size=256
+cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
-CUDA_VISIBLE_DEVICES="4" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_13b_thr.csv

# 20B
tp=2
max_batch_size=256
+cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
-CUDA_VISIBLE_DEVICES="4,5" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_pt_20b_thr.csv

# 70B
tp=4
max_batch_size=256
+cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
-CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_70b_thr.csv

########################################## PyTorch engine: w8a8 ##########################################

diff --git a/benchmark/benchmark_turbomind_engine_a100.sh b/benchmark/benchmark_turbomind_engine_a100.sh
index e0e5197e7..5ba0f12e8 100755
--- a/benchmark/benchmark_turbomind_engine_a100.sh
+++ b/benchmark/benchmark_turbomind_engine_a100.sh
@@ -2,62 +2,65 @@ dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"

########################################## TurboMind engine: fp16 or bf16 ##########################################
-## 7B. gemm_tune -> profile_throughput
+# 7B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
-python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_7b_thr.csv
+rm gemm_config.in

-## 13B. gemm_tune -> profile_throughput
+# 13B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
-python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_13b_thr.csv
+rm gemm_config.in

# 20B. gemm_tune -> profile_throughput
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
-python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="5,6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
+CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_tb_20b_thr.csv
+rm gemm_config.in

# 70B
-tp=1
+tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_70b_thr.csv

# ########################################## TurboMind engine: w4a16 ##########################################
-## 7B
+# 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models/quantization/llama-2-7b-chat-4bit"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_7b_4bit_thr.csv

-## 13B
+# 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
-model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+model_path="/workspace/models/quantization/llama-2-13b-chat-4bit"
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_13b_4bit_thr.csv

-## 20B
+# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
-model_path="/workspace/models-140/InternLM/internlm-chat-20b"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+model_path="/workspace/models/quantization/internlm-chat-20b-4bit"
+CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv internlm_tb_20b_4bit_thr.csv

-## 70B
+# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
-model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+model_path="/workspace/models/quantization/llama-2-70b-chat-hf-4bit"
+CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_70b_4bit_thr.csv

diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 2e5609c26..d96614929 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -303,9 +303,11 @@ def main():
            cache_max_entry_count=args.cache_max_entry_count,
            model_format=args.model_format)
    elif args.backend == 'pytorch':
-        engine_config = PytorchEngineConfig(session_len=args.session_len,
-                                            max_batch_size=args.concurrency,
-                                            tp=args.tp)
+        engine_config = PytorchEngineConfig(
+            session_len=args.session_len,
+            cache_max_entry_count=args.cache_max_entry_count,
+            max_batch_size=args.concurrency,
+            tp=args.tp)

    engine = Engine(args.model_path, engine_config, csv=args.csv)

diff --git a/docs/en/benchmark/profile_generation.md b/docs/en/benchmark/profile_generation.md
index ffba77bed..1ee8f8549 100644
--- a/docs/en/benchmark/profile_generation.md
+++ b/docs/en/benchmark/profile_generation.md
@@ -1,6 +1,6 @@
-# Profile Static Inference Performance
+# Profile Token Latency and Throughput

-We view the performance of the inference engine under the fixed batch and fixed input/output token as static inference performance.
+We profile the latency and throughput of generated tokens with a fixed batch size and fixed input/output token lengths.

The profiling script is `profile_generation.py`. Before running it, please install the lmdeploy precompiled package and download the profiling script:
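
Note on the TurboMind fp16/bf16 sections above: each model follows a tune-then-profile-then-clean pattern. A minimal sketch of one iteration, reusing the 7B values from the script (all commands appear verbatim in the patch; the comments state assumptions about where the tuning file lands):

# Values copied from the 7B section of benchmark_turbomind_engine_a100.sh.
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
# GEMM tuning writes gemm_config.in into the current working directory,
# and TurboMind reads it from there during the subsequent profiling run.
python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
# Delete the tuning file so the next model/tp combination does not reuse stale results.
rm gemm_config.in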
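
The profile_throughput.py hunk plumbs --cache-max-entry-count (the fraction of free GPU memory reserved for the k/v cache) into the PyTorch backend, which previously ignored it. A sketch of the post-patch construction with concrete values standing in for the script's args; the import path is an assumption and should be checked against the installed lmdeploy version:

# Minimal sketch; assumes PytorchEngineConfig is importable from
# lmdeploy.messages, as in lmdeploy releases around this patch.
from lmdeploy.messages import PytorchEngineConfig

engine_config = PytorchEngineConfig(
    session_len=2048,           # args.session_len in the script
    cache_max_entry_count=0.9,  # args.cache_max_entry_count, newly honored here
    max_batch_size=256,         # args.concurrency
    tp=1)                       # args.tp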
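
The docs hunk is truncated at the trailing colon, so the install/download commands it introduces are not part of this diff. They would be along these lines; the wget URL is an assumption about the script's location in the repository, not text from the patch:

# Assumed setup commands; the actual block follows the truncated hunk above.
pip install lmdeploy
wget https://raw.githubusercontent.com/InternLM/lmdeploy/main/benchmark/profile_generation.py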