From ae156ca736c26a149bcb1fefe3ef6fcf91b9728d Mon Sep 17 00:00:00 2001
From: Lyu Han
Date: Wed, 21 Feb 2024 19:25:31 +0800
Subject: [PATCH] minor fix benchmark generation guide and script (#1175)

* minor fix benchmark generation guide and script

* update
---
 benchmark/benchmark_pytorch_engine_a100.sh   | 12 ++++--
 benchmark/benchmark_turbomind_engine_a100.sh | 45 +++++++++++---------
 benchmark/profile_throughput.py              |  8 ++--
 docs/en/benchmark/profile_generation.md      |  4 +-
 4 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/benchmark/benchmark_pytorch_engine_a100.sh b/benchmark/benchmark_pytorch_engine_a100.sh
index ac41ca06f..220eda241 100755
--- a/benchmark/benchmark_pytorch_engine_a100.sh
+++ b/benchmark/benchmark_pytorch_engine_a100.sh
@@ -5,25 +5,29 @@ dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
## 7B
tp=1
max_batch_size=256
+cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
-CUDA_VISIBLE_DEVICES="4" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv

## 13B
tp=1
max_batch_size=256
+cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
-CUDA_VISIBLE_DEVICES="4" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_13b_thr.csv

# 20B
tp=2
max_batch_size=256
+cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
-CUDA_VISIBLE_DEVICES="4,5" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_pt_20b_thr.csv

# 70B
tp=4
max_batch_size=256
+cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
-CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size}
+CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_70b_thr.csv

########################################## PyTorch engine: w8a8 ##########################################

diff --git a/benchmark/benchmark_turbomind_engine_a100.sh b/benchmark/benchmark_turbomind_engine_a100.sh
index e0e5197e7..5ba0f12e8 100755
--- a/benchmark/benchmark_turbomind_engine_a100.sh
+++ b/benchmark/benchmark_turbomind_engine_a100.sh
@@ -2,62 +2,65 @@ dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"

########################################## TurboMind engine: fp16 or bf16 ##########################################
-## 7B. gemm_tune -> profile_throughput
+# 7B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
-python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_7b_thr.csv
+rm gemm_config.in

-## 13B. gemm_tune -> profile_throughput
+# 13B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
-python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_13b_thr.csv
+rm gemm_config.in

# 20B. gemm_tune -> profile_throughput
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
-python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="5,6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
+CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_tb_20b_thr.csv
+rm gemm_config.in

# 70B
-tp=1
+tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
+CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_70b_thr.csv

# ########################################## TurboMind engine: w4a16 ##########################################
-## 7B
+# 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models/quantization/llama-2-7b-chat-4bit"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_7b_4bit_thr.csv

-## 13B
+# 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
-model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+model_path="/workspace/models/quantization/llama-2-13b-chat-4bit"
+CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_13b_4bit_thr.csv

-## 20B
+# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
-model_path="/workspace/models-140/InternLM/internlm-chat-20b"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+model_path="/workspace/models/quantization/internlm-chat-20b-4bit"
+CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv internlm_tb_20b_4bit_thr.csv

-## 70B
+# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
-model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
-python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000
+model_path="/workspace/models/quantization/llama-2-70b-chat-hf-4bit"
+CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_70b_4bit_thr.csv

diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 2e5609c26..d96614929 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -303,9 +303,11 @@ def main():
            cache_max_entry_count=args.cache_max_entry_count,
            model_format=args.model_format)
    elif args.backend == 'pytorch':
-        engine_config = PytorchEngineConfig(session_len=args.session_len,
-                                            max_batch_size=args.concurrency,
-                                            tp=args.tp)
+        engine_config = PytorchEngineConfig(
+            session_len=args.session_len,
+            cache_max_entry_count=args.cache_max_entry_count,
+            max_batch_size=args.concurrency,
+            tp=args.tp)

    engine = Engine(args.model_path, engine_config, csv=args.csv)

diff --git a/docs/en/benchmark/profile_generation.md b/docs/en/benchmark/profile_generation.md
index ffba77bed..1ee8f8549 100644
--- a/docs/en/benchmark/profile_generation.md
+++ b/docs/en/benchmark/profile_generation.md
@@ -1,6 +1,6 @@
-# Profile Static Inference Performance
+# Profile Token Latency and Throughput

-We view the performance of the inference engine under the fixed batch and fixed input/output token as static inference performance.
+We profile the latency and throughput of generated tokens with a fixed batch size and fixed input/output token lengths.

The profiling script is `profile_generation.py`. Before running it, please install the lmdeploy precompiled package and download the profiling script:
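
Note on the TurboMind fp16/bf16 sections above: each model follows a tune-then-profile-then-clean pattern. A minimal sketch of one iteration, reusing the 7B values from the script (all commands appear verbatim in the patch; the comments state assumptions about where the tuning file lands):

# Values copied from the 7B section of benchmark_turbomind_engine_a100.sh.
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
# GEMM tuning writes gemm_config.in into the current working directory,
# and TurboMind reads it from there during the subsequent profiling run.
python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count}
# Delete the tuning file so the next model/tp combination does not reuse stale results.
rm gemm_config.in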
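
The profile_throughput.py hunk plumbs --cache-max-entry-count (the fraction of free GPU memory reserved for the k/v cache) into the PyTorch backend, which previously ignored it. A sketch of the post-patch construction with concrete values standing in for the script's args; the import path is an assumption and should be checked against the installed lmdeploy version:

# Minimal sketch; assumes PytorchEngineConfig is importable from
# lmdeploy.messages, as in lmdeploy releases around this patch.
from lmdeploy.messages import PytorchEngineConfig

engine_config = PytorchEngineConfig(
    session_len=2048,           # args.session_len in the script
    cache_max_entry_count=0.9,  # args.cache_max_entry_count, newly honored here
    max_batch_size=256,         # args.concurrency
    tp=1)                       # args.tp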
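
The docs hunk is truncated at the trailing colon, so the install/download commands it introduces are not part of this diff. They would be along these lines; the wget URL is an assumption about the script's location in the repository, not text from the patch:

# Assumed setup commands; the actual block follows the truncated hunk above.
pip install lmdeploy
wget https://raw.githubusercontent.com/InternLM/lmdeploy/main/benchmark/profile_generation.py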