Commit 47d0f04: trtllm h100 content
sraskar committed Sep 8, 2024
1 parent afdf070

Showing 7 changed files with 2,230 additions and 0 deletions.
56 changes: 56 additions & 0 deletions TensorRT-LLM/H100/README.MD
# TRT-LLM on H100

1. Set up the virtual environment

```bash
module use /soft/modulefiles/
module load conda
module load openmpi/4.1.1-nvhpc

conda create -n TensorRT_LLM python=3.10
conda activate TensorRT_LLM
conda install -c conda-forge mpi4py openmpi

```
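As an optional sanity check (not part of the original steps), you can confirm that `mpi4py` was built against the loaded Open MPI:

```bash
# Should print the MPI vendor and version, e.g. ('Open MPI', (4, 1, 1)).
python -c "from mpi4py import MPI; print(MPI.get_vendor())"
```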

2. Install dependencies
```bash
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
cd examples/llama/
MPICC=$(which mpicc) MPICXX=$(which mpicxx) pip install -r requirements.txt
```
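If the install succeeded, the package should import cleanly; a minimal check (not in the original steps):

```bash
# Prints the TensorRT-LLM version pinned by requirements.txt.
python -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
```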

3. Run a single benchmark
```bash
export dir_1=<HF weights path>
export dir_2=<trt weights path>
export dir_3=<trt engines path>
python convert_checkpoint.py --tp_size=1 --model_dir=$dir_1 --output_dir=$dir_2 --dtype=float16
trtllm-build --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=float16 --max_batch_size=1 --max_input_len=128 --max_output_len=128
python3 ../run.py --model_name="mistral_7b" --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=128 --max_input_length=128 --run_profiling --batch_size=1
```
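For reference, here is one way the placeholder paths might be filled in (hypothetical locations; substitute your own). The engine built above caps sequences at 128 tokens, which is why the run step uses the same limit.

```bash
# Hypothetical example values; adjust to your filesystem.
export dir_1=/path/to/hf/Mistral-7B-v0.1      # downloaded Hugging Face weights
export dir_2=/path/to/trt_weights/mistral-7b  # converted TRT-LLM checkpoint
export dir_3=/path/to/trt_engines/mistral-7b  # compiled TensorRT engines
```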

4. Replace or copy the files `run_power.py`, `run_precision_bench.py`, `utils.py`, and `run.py` from this directory into the cloned TensorRT-LLM directory, as shown below.
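A minimal sketch of that copy step, assuming this repository is checked out next to the TensorRT-LLM clone (the benchmark scripts call `../run_power.py` and `../run_precision.py` from `examples/llama/`, so `examples/` is the assumed destination):

```bash
# Hypothetical path; adjust to where you cloned TensorRT-LLM.
cp run_power.py run_precision_bench.py utils.py run.py /path/to/TensorRT-LLM/examples/
```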

5. Run the benchmarks (see the example below).
   Use `p-llama2-7b.sh` to run the power benchmarks.
   Use `q-llama2-7b.sh` to run the precision benchmarks.
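For example (assumed invocation; each script sets its own model paths and loops over batch sizes internally):

```bash
bash p-llama2-7b.sh 2>&1 | tee power-llama2-7b.log       # power benchmarks
bash q-llama2-7b.sh 2>&1 | tee precision-llama2-7b.log   # precision benchmarks
```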

32 changes: 32 additions & 0 deletions TensorRT-LLM/H100/p-llama2-7b.sh
export HF_TOKEN="<your Hugging Face access token>"
export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"

pip install pynvml==11.5.0
pip install pydantic-core==2.18.1
pip install psutil
pip install py3nvml

cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/

model_name="meta-llama/Llama-2-7b-hf"
dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"


for tensor_parallel in 1; do
  for precision in "float16"; do
    # Clear any previously converted weights and built engines.
    rm -rf $dir_2/*
    rm -rf $dir_3/*
    # Convert the HF checkpoint to a TRT-LLM checkpoint at this precision.
    python convert_checkpoint.py --workers=64 --tp_size=$tensor_parallel --model_dir=$dir_1 --output_dir=$dir_2 --dtype=$precision
    for batch_size in 1 16 32 64; do
      for input_output_length in 1024; do
        # Build an engine for this batch size and sequence length, then run the power benchmark.
        trtllm-build --workers=64 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=$precision --gpt_attention_plugin=$precision --max_batch_size=$batch_size --max_input_len=$input_output_length
        mpirun -np $tensor_parallel python3 ../run_power.py --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
      done
    done
  done
done
37 changes: 37 additions & 0 deletions TensorRT-LLM/H100/q-llama2-7b.sh
export HF_TOKEN="<your Hugging Face access token>"
export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"

pip install pynvml==11.5.0
pip install pydantic-core==2.18.1
# pip install psutil
pip install psutil==5.9.8

pip install pydantic==2.7.0
pip install regex==2024.5.15

cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/

model_name="meta-llama/Llama-2-7b-hf"
dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
# dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
# dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"
dir_2="."
dir_3="."


for tensor_parallel in 1; do
  for precision in "full_prec" "int8_sq" "int4_awq"; do
    for kv_cache_precision in "int8" "fp8"; do
      # rm -rf $dir_2/*
      # rm -rf $dir_3/*
      # Quantize the HF checkpoint to the requested weight format and KV-cache precision.
      python ../quantization/quantize.py --model_dir $dir_1 --dtype float16 --qformat $precision --kv_cache_dtype $kv_cache_precision --output_dir $dir_2 --calib_size 10 --tp_size $tensor_parallel --batch_size=1
      for batch_size in 1 16 32 64; do
        for input_output_length in 1024; do
          # Build an engine for this configuration, then run the precision benchmark.
          trtllm-build --workers=48 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --max_batch_size=$batch_size --max_input_len=$input_output_length
          mpirun -np $tensor_parallel python3 ../run_precision.py --qformat $precision --kv_cache_dtype $kv_cache_precision --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
        done
      done
    done
  done
done
