# TRT-LLM on H100

1. Set up the virtual environment

```bash
module use /soft/modulefiles/
module load conda
module load openmpi/4.1.1-nvhpc

conda create -n TensorRT_LLM python=3.10
conda activate TensorRT_LLM
conda install -c conda-forge mpi4py openmpi
```

2. Install dependencies

```bash
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
cd examples/llama/
MPICC=$(which mpicc) MPICXX=$(which mpicxx) pip install -r requirements.txt
```
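
After the install, a quick sanity check can confirm the build (optional; assumes the `tensorrt_llm` and `mpi4py` packages installed above import cleanly in the active conda environment):

```bash
# Optional sanity check: confirm the key packages import and report versions.
python -c "import tensorrt_llm; print('tensorrt_llm', tensorrt_llm.__version__)"
python -c "from mpi4py import MPI; print('mpi4py OK, world size:', MPI.COMM_WORLD.Get_size())"
```
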
3. Run a single benchmark

```bash
export dir_1=<HF weights path>
export dir_2=<trt weights path>
export dir_3=<trt engines path>
python convert_checkpoint.py --tp_size=1 --model_dir=$dir_1 --output_dir=$dir_2 --dtype=float16
trtllm-build --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=float16 --max_batch_size=1 --max_input_len=128 --max_output_len=128
python3 ../run.py --model_name="mistral_7b" --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=128 --max_input_length=128 --run_profiling --batch_size=1
```
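
One hypothetical way to fill in these placeholders (illustrative only; assumes the `huggingface-cli` tool is available and that you have access to the gated Llama-2 weights; all paths below are made up):

```bash
# Illustrative only: fetch the HF checkpoint and choose output locations.
huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir $HOME/models/Llama-2-7b-hf
export dir_1=$HOME/models/Llama-2-7b-hf       # Hugging Face weights
export dir_2=$HOME/trt/weights/llama2-7b      # converted TRT-LLM checkpoint
export dir_3=$HOME/trt/engines/llama2-7b      # built TRT engines
```
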
4. Replace or copy the files `run_power.py`, `run_precision_bench.py`, `utils.py`, and `run.py` from this directory into the cloned TensorRT-LLM repository (a copy sketch is shown below).
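
A minimal sketch of the copy step, assuming this repository sits next to the TensorRT-LLM clone (paths are hypothetical; adjust to your layout):

```bash
# Hypothetical paths: the driver scripts are called as ../<name>.py from examples/llama/,
# so they belong one level above, in examples/.
TRT_LLM=/path/to/TensorRT-LLM
cp run_power.py run_precision_bench.py utils.py $TRT_LLM/examples/
cp run.py $TRT_LLM/examples/run.py   # replaces the stock examples/run.py
```
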
5. Run the benchmarks (example invocations below).
   Use `p-llama2-7b.sh` to run the power benchmarks.
   Use `q-llama2-7b.sh` to run the precision benchmarks.
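
For example (hypothetical invocations; `tee` is only there to keep a log):

```bash
# Power benchmark sweep, logged to a file.
bash p-llama2-7b.sh 2>&1 | tee power-llama2-7b.log

# Quantization/precision benchmark sweep, logged to a file.
bash q-llama2-7b.sh 2>&1 | tee precision-llama2-7b.log
```
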
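The power benchmark driver (presumably the `p-llama2-7b.sh` referenced in step 5): it pins the monitoring dependencies, converts the Llama-2-7B checkpoint for each tensor-parallel/precision setting, and then builds an engine and runs `run_power.py` for every batch size and sequence length.
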
```bash
# Hugging Face credentials and cache locations.
export HF_TOKEN="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"
export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"

# Pin the monitoring/runtime dependencies (NVML bindings, psutil).
pip install pynvml==11.5.0
pip install pydantic-core==2.18.1
pip install psutil
pip install py3nvml

cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/

# Hugging Face snapshot, converted TRT-LLM checkpoint, and engine output locations.
model_name="meta-llama/Llama-2-7b-hf"
dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"

# Convert the checkpoint once per tensor-parallel/precision setting, then build an
# engine and profile power with run_power.py for each batch size and sequence length.
for tensor_parallel in 1; do
    for precision in "float16"; do
        rm -rf $dir_2/*
        rm -rf $dir_3/*
        python convert_checkpoint.py --workers=64 --tp_size=$tensor_parallel --model_dir=$dir_1 --output_dir=$dir_2 --dtype=$precision
        for batch_size in 1 16 32 64; do
            for input_output_length in 1024; do
                trtllm-build --workers=64 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=$precision --gpt_attention_plugin=$precision --max_batch_size=$batch_size --max_input_len=$input_output_length
                mpirun -np $tensor_parallel python3 ../run_power.py --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
            done
        done
    done
done
```
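
Independently of `run_power.py`, board power can also be sampled on the side while a sweep runs (a standard `nvidia-smi` query; not part of these scripts):

```bash
# Log GPU power draw once per second to a CSV file.
nvidia-smi --query-gpu=timestamp,index,power.draw --format=csv -l 1 > power_samples.csv
```
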
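The precision benchmark driver (presumably the `q-llama2-7b.sh` referenced in step 5): for each weight format (`full_prec`, SmoothQuant `int8_sq`, AWQ `int4_awq`) and KV-cache precision (`int8`, `fp8`), it quantizes the checkpoint with `quantize.py`, then builds an engine and profiles each batch size.
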
```bash
# Hugging Face credentials and cache locations.
export HF_TOKEN="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"
export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"

# Pin the monitoring/runtime dependencies.
pip install pynvml==11.5.0
pip install pydantic-core==2.18.1
# pip install psutil
pip install psutil==5.9.8

pip install pydantic==2.7.0
pip install regex==2024.5.15

cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/

model_name="meta-llama/Llama-2-7b-hf"
dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
# dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
# dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"
# Quantized checkpoints and engines are written to the current directory.
dir_2="."
dir_3="."

# Quantize once per (weight format, KV-cache precision) pair, then build an engine
# and benchmark each batch size at a 1024-token sequence length.
for tensor_parallel in 1; do
    for precision in "full_prec" "int8_sq" "int4_awq"; do
        for kv_cache_precision in "int8" "fp8"; do
            # rm -rf $dir_2/*
            # rm -rf $dir_3/*
            python ../quantization/quantize.py --model_dir $dir_1 --dtype float16 --qformat $precision --kv_cache_dtype $kv_cache_precision --output_dir $dir_2 --calib_size 10 --tp_size $tensor_parallel --batch_size=1
            for batch_size in 1 16 32 64; do
                for input_output_length in 1024; do
                    trtllm-build --workers=48 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --max_batch_size=$batch_size --max_input_len=$input_output_length
                    mpirun -np $tensor_parallel python3 ../run_precision.py --qformat $precision --kv_cache_dtype $kv_cache_precision --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
                done
            done
        done
    done
done
```