# TRT-LLM on H100

1. Set up the virtual environment

```bash
module use /soft/modulefiles/
module load conda
module load openmpi/4.1.1-nvhpc

conda create -n TensorRT_LLM python=3.10
conda activate TensorRT_LLM
conda install -c conda-forge mpi4py openmpi
```

2. Install dependencies

```bash
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
cd examples/llama/
MPICC=$(which mpicc) MPICXX=$(which mpicxx) pip install -r requirements.txt
```
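
After the install, a quick sanity check can confirm the build (optional; assumes the `tensorrt_llm` and `mpi4py` packages installed above import cleanly in the active conda environment):

```bash
# Optional sanity check: confirm the key packages import and report versions.
python -c "import tensorrt_llm; print('tensorrt_llm', tensorrt_llm.__version__)"
python -c "from mpi4py import MPI; print('mpi4py OK, world size:', MPI.COMM_WORLD.Get_size())"
```
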
3. Run a single benchmark

```bash
export dir_1=<HF weights path>
export dir_2=<trt weights path>
export dir_3=<trt engines path>
python convert_checkpoint.py --tp_size=1 --model_dir=$dir_1 --output_dir=$dir_2 --dtype=float16
trtllm-build --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=float16 --max_batch_size=1 --max_input_len=128 --max_output_len=128
python3 ../run.py --model_name="mistral_7b" --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=128 --max_input_length=128 --run_profiling --batch_size=1
```
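
One hypothetical way to fill in these placeholders (illustrative only; assumes the `huggingface-cli` tool is available and that you have access to the gated Llama-2 weights; all paths below are made up):

```bash
# Illustrative only: fetch the HF checkpoint and choose output locations.
huggingface-cli download meta-llama/Llama-2-7b-hf --local-dir $HOME/models/Llama-2-7b-hf
export dir_1=$HOME/models/Llama-2-7b-hf       # Hugging Face weights
export dir_2=$HOME/trt/weights/llama2-7b      # converted TRT-LLM checkpoint
export dir_3=$HOME/trt/engines/llama2-7b      # built TRT engines
```
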
4. Replace or copy the files `run_power.py`, `run_precision_bench.py`, `utils.py`, and `run.py` from this directory into the cloned TensorRT-LLM repository (a copy sketch is shown below).
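
A minimal sketch of the copy step, assuming this repository sits next to the TensorRT-LLM clone (paths are hypothetical; adjust to your layout):

```bash
# Hypothetical paths: the driver scripts are called as ../<name>.py from examples/llama/,
# so they belong one level above, in examples/.
TRT_LLM=/path/to/TensorRT-LLM
cp run_power.py run_precision_bench.py utils.py $TRT_LLM/examples/
cp run.py $TRT_LLM/examples/run.py   # replaces the stock examples/run.py
```
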
5. Run the benchmarks (example invocations below).
   Use `p-llama2-7b.sh` to run the power benchmarks.
   Use `q-llama2-7b.sh` to run the precision benchmarks.
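
For example (hypothetical invocations; `tee` is only there to keep a log):

```bash
# Power benchmark sweep, logged to a file.
bash p-llama2-7b.sh 2>&1 | tee power-llama2-7b.log

# Quantization/precision benchmark sweep, logged to a file.
bash q-llama2-7b.sh 2>&1 | tee precision-llama2-7b.log
```
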
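The power benchmark driver (presumably the `p-llama2-7b.sh` referenced in step 5): it pins the monitoring dependencies, converts the Llama-2-7B checkpoint for each tensor-parallel/precision setting, and then builds an engine and runs `run_power.py` for every batch size and sequence length.
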
```bash
# Hugging Face credentials and cache locations.
export HF_TOKEN="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"
export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"

# Pin the monitoring/runtime dependencies (NVML bindings, psutil).
pip install pynvml==11.5.0
pip install pydantic-core==2.18.1
pip install psutil
pip install py3nvml

cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/

# Hugging Face snapshot, converted TRT-LLM checkpoint, and engine output locations.
model_name="meta-llama/Llama-2-7b-hf"
dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"

# Convert the checkpoint once per tensor-parallel/precision setting, then build an
# engine and profile power with run_power.py for each batch size and sequence length.
for tensor_parallel in 1; do
    for precision in "float16"; do
        rm -rf $dir_2/*
        rm -rf $dir_3/*
        python convert_checkpoint.py --workers=64 --tp_size=$tensor_parallel --model_dir=$dir_1 --output_dir=$dir_2 --dtype=$precision
        for batch_size in 1 16 32 64; do
            for input_output_length in 1024; do
                trtllm-build --workers=64 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=$precision --gpt_attention_plugin=$precision --max_batch_size=$batch_size --max_input_len=$input_output_length
                mpirun -np $tensor_parallel python3 ../run_power.py --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
            done
        done
    done
done
```
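
Independently of `run_power.py`, board power can also be sampled on the side while a sweep runs (a standard `nvidia-smi` query; not part of these scripts):

```bash
# Log GPU power draw once per second to a CSV file.
nvidia-smi --query-gpu=timestamp,index,power.draw --format=csv -l 1 > power_samples.csv
```
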
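The precision benchmark driver (presumably the `q-llama2-7b.sh` referenced in step 5): for each weight format (`full_prec`, SmoothQuant `int8_sq`, AWQ `int4_awq`) and KV-cache precision (`int8`, `fp8`), it quantizes the checkpoint with `quantize.py`, then builds an engine and profiles each batch size.
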
```bash
# Hugging Face credentials and cache locations.
export HF_TOKEN="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"
export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"

# Pin the monitoring/runtime dependencies.
pip install pynvml==11.5.0
pip install pydantic-core==2.18.1
# pip install psutil
pip install psutil==5.9.8

pip install pydantic==2.7.0
pip install regex==2024.5.15

cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/

model_name="meta-llama/Llama-2-7b-hf"
dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
# dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
# dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"
# Quantized checkpoints and engines are written to the current directory.
dir_2="."
dir_3="."

# Quantize once per (weight format, KV-cache precision) pair, then build an engine
# and benchmark each batch size at a 1024-token sequence length.
for tensor_parallel in 1; do
    for precision in "full_prec" "int8_sq" "int4_awq"; do
        for kv_cache_precision in "int8" "fp8"; do
            # rm -rf $dir_2/*
            # rm -rf $dir_3/*
            python ../quantization/quantize.py --model_dir $dir_1 --dtype float16 --qformat $precision --kv_cache_dtype $kv_cache_precision --output_dir $dir_2 --calib_size 10 --tp_size $tensor_parallel --batch_size=1
            for batch_size in 1 16 32 64; do
                for input_output_length in 1024; do
                    trtllm-build --workers=48 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --max_batch_size=$batch_size --max_input_len=$input_output_length
                    mpirun -np $tensor_parallel python3 ../run_precision.py --qformat $precision --kv_cache_dtype $kv_cache_precision --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
                done
            done
        done
    done
done
```