diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 82f9275..0000000 --- a/.gitignore +++ /dev/null @@ -1,162 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/
diff --git a/Deepspeed-MII/README.md b/Deepspeed-MII/README.md
deleted file mode 100644
index 6a1d524..0000000
--- a/Deepspeed-MII/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# Deepspeed-MII
-Deepspeed-MII
diff --git a/InferenceGraphPlotter/README.md b/InferenceGraphPlotter/README.md
deleted file mode 100644
index 182ea8e..0000000
--- a/InferenceGraphPlotter/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# InferenceGraphPlotter
-
-## How to run?
-1. Clone the repo and cd into it
-2. Spin up a simple webserver to serve the files. One way is by using Python.
-   - for python 2: python -m SimpleHTTPServer
-   - for python 3: python -m http.server
-3. Open a web browser and go to http://localhost:8000
\ No newline at end of file
diff --git a/README.md b/README.md
index db4a354..182ea8e 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,8 @@
-# LLaMA-Inference-Bench
-
-LLM-Inference-Bench: Inference Benchmarking of Large Language Models on AI Accelerators
-
-## Matrix of Evaluated Frameworks and Hardware:
-
-| Framework/ Hardware | NVIDIA A100 | NVIDIA H100 | NVIDIA GH200 | AMD MI250 | Intel PVC | Habana Gaudi2 | Sambanova SN40L |
-|:-----------------------:|:---------------:|:---------------:|:------------:|:---------:|:---------:|:-------------:|:---------------:|
-| [vLLM](./vLLM/README.md) | [Link]() | [Link]() | Yes | [Link]() | [Link]() | No | N/A |
-| [llama.cpp](./llama.cpp/README.md) | [Link]() | [Link]() | Yes | [Link]() | [Link]() | N/A | N/A |
-| [TensorRT-LLM](./TensorRT-LLM/README.md) | [Link]() | [Link]() | [Link]() | N/A | N/A | N/A | N/A |
-| [DeepSpeed-MII](./Deepspeed-MII/README.md) | No | No | No | No | No | [Link]() | N/A |
-
-## Key Insights
-
-
- Cite this work:
- ```
- @INPROCEEDINGS{####,
- author={Krishna Teja Chitty-Venkata and Siddhisanket Raskar and Bharat Kale and Farah Ferdaus and Aditya Tanikanti and Ken Raffenetti and Valerie Taylor and Murali Emani and Venkatram Vishwanath},
- booktitle={2024 IEEE/ACM International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
- title={LLM-Inference-Bench: Inference Benchmarking of Large Language Models on AI Accelerators},
- year={2024},
- volume={},
- number={},
- pages={},
- keywords={Large Language Models, AI Accelerators, Performance Evaluation, Benchmarking},
- doi={}}
- ```
+# InferenceGraphPlotter
+
+## How to run?
+1. Clone the repo and cd into it
+2. Spin up a simple webserver to serve the files. One way is by using Python.
+   - for python 2: python -m SimpleHTTPServer
+   - for python 3: python -m http.server
+3. Open a web browser and go to http://localhost:8000
\ No newline at end of file
diff --git a/TensorRT-LLM/A100/README.MD b/TensorRT-LLM/A100/README.MD
deleted file mode 100644
index e69de29..0000000
diff --git a/TensorRT-LLM/GH200/README.MD b/TensorRT-LLM/GH200/README.MD
deleted file mode 100644
index e69de29..0000000
diff --git a/TensorRT-LLM/H100/README.MD b/TensorRT-LLM/H100/README.MD
deleted file mode 100644
index 1c2d0c6..0000000
--- a/TensorRT-LLM/H100/README.MD
+++ /dev/null
@@ -1,56 +0,0 @@
-# TRT-LLM on H100
-
-1. Set up a Virtual Environment
-
-   ```bash
-   module use /soft/modulefiles/
-   module load conda
-   module load openmpi/4.1.1-nvhpc
-
-   conda create -n TensorRT_LLM python=3.10
-   conda activate TensorRT_LLM
-   conda install -c conda-forge mpi4py openmpi
-
-   ```
-
-2. Install Dependencies
-   ```bash
-   git clone https://github.com/NVIDIA/TensorRT-LLM.git
-
-   cd TensorRT-LLM
-   cd examples/llama/
-
-   MPICC=$(which mpicc) MPICXX=$(which mpicxx) pip install -r requirements.txt
-   ```
-
-3. Run a single Benchmark
-   ```bash
-
-   export dir_1=
-   export dir_2=
-   export dir_3=
-
-   python convert_checkpoint.py --tp_size=1 --model_dir=$dir_1 --output_dir=$dir_2 --dtype=float16
-
-   trtllm-build --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=float16 --max_batch_size=1 --max_input_len=128 --max_output_len=128
-
-   python3 ../run.py --model_name="mistral_7b" --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=128 --max_input_length=$input_output_length --run_profiling --batch_size=1
-   ```
-
-4. Replace or copy the files `run_power.py`, `run_precision_bench.py`, `utils.py`, and `run.py` from this directory into the cloned TensorRT-LLM directory.
-
-5. Run benchmarks.
-Use `p-llama2-7b.sh` to run the power benchmarks.
-Use `q-llama2-7b.sh` to run the precision benchmarks.
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/TensorRT-LLM/H100/p-llama2-7b.sh b/TensorRT-LLM/H100/p-llama2-7b.sh
deleted file mode 100755
index 2b846ca..0000000
--- a/TensorRT-LLM/H100/p-llama2-7b.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-export HF_TOKEN="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"
-export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
-export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"
-
-pip install pynvml==11.5.0
-pip install pydantic-core==2.18.1
-pip install psutil
-pip install py3nvml
-
-cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/
-
-model_name="meta-llama/Llama-2-7b-hf"
-dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
-dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
-dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"
-
-
-
-
-for tensor_parallel in 1; do
-  for precision in "float16"; do
-    rm -rf $dir_2/*
-    rm -rf $dir_3/*
-    python convert_checkpoint.py --workers=64 --tp_size=$tensor_parallel --model_dir=$dir_1 --output_dir=$dir_2 --dtype=$precision
-    for batch_size in 1 16 32 64; do
-      for input_output_length in 1024; do
-        trtllm-build --workers=64 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=$precision --gpt_attention_plugin=$precision --max_batch_size=$batch_size --max_input_len=$input_output_length
-        mpirun -np $tensor_parallel python3 ../run_power.py --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
-      done
-    done
-  done
-done
\ No newline at end of file
diff --git a/TensorRT-LLM/H100/q-llama2-7b.sh b/TensorRT-LLM/H100/q-llama2-7b.sh
deleted file mode 100755
index 60249ce..0000000
--- a/TensorRT-LLM/H100/q-llama2-7b.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-export HF_TOKEN="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"
-export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
-export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"
-
-pip install pynvml==11.5.0
-pip install pydantic-core==2.18.1
-# pip install psutil
-pip install psutil==5.9.8
-
-pip install pydantic==2.7.0
-pip install regex==2024.5.15
-
-cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/
-
-model_name="meta-llama/Llama-2-7b-hf"
-dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9" -# dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b" -# dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b" -dir_2="." -dir_3="." - - -for tensor_parallel in 1; do - for precision in "full_prec" "int8_sq" "int4_awq"; do - for kv_cache_precision in "int8" "fp8"; do - # rm -rf $dir_2/* - # rm -rf $dir_3/* - python ../quantization/quantize.py --model_dir $dir_1 --dtype float16 --qformat $precision --kv_cache_dtype $kv_cache_precision --output_dir $dir_2 --calib_size 10 --tp_size $tensor_parallel --batch_size=1 - for batch_size in 1 16 32 64; do - for input_output_length in 1024; do - trtllm-build --workers=48 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --max_batch_size=$batch_size --max_input_len=$input_output_length - mpirun -np $tensor_parallel python3 ../run_precision.py --qformat $precision --kv_cache_dtype $kv_cache_precision --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size - done - done - done - done -done \ No newline at end of file diff --git a/TensorRT-LLM/H100/run.py b/TensorRT-LLM/H100/run.py deleted file mode 100644 index f1994d3..0000000 --- a/TensorRT-LLM/H100/run.py +++ /dev/null @@ -1,550 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import ast -import csv -import os -from pathlib import Path - -import numpy as np -import torch -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - add_common_args, load_tokenizer, read_decoder_start_token_id, - read_model_name, supports_inflight_batching, - throttle_generator) - -import tensorrt_llm -import tensorrt_llm.profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner - -if PYTHON_BINDINGS: - from tensorrt_llm.runtime import ModelRunnerCpp - - -def parse_arguments(args=None): - # see `add_common_args` for extended list of arguments - parser = argparse.ArgumentParser() - - parser.add_argument('--pp_size', type=int, default = 1) - parser.add_argument('--tp_size', type=int, default = 1) - parser.add_argument('--moe_ep_size', type=int, default = 1) - parser.add_argument('--moe_tp_size', type=int, default = 1) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') - parser.add_argument('--precision', type=str, default="float16", help="precision") - parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") - - parser.add_argument('--qformat', type=str, default="float16", help="precision") - parser.add_argument('--kv_cache_dtype', type=str, default="float16", help="precision") - - parser.add_argument('--max_input_length', type=int, default=923) - parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--input_text', - type=str, - nargs='+', - default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--input_file', - type=str, - help= - 'CSV or Numpy file containing tokenized input. Alternative to text input.', - default=None) - parser.add_argument('--output_csv', - type=str, - help='CSV file where the tokenized output is stored.', - default=None) - parser.add_argument('--output_npy', - type=str, - help='Numpy file where the tokenized output is stored.', - default=None) - parser.add_argument( - '--output_logits_npy', - type=str, - help= - 'Numpy file where the generation logits are stored. 
Use only when num_beams==1', - default=None) - parser.add_argument('--output_log_probs_npy', - type=str, - help='Numpy file where the log_probs are stored', - default=None) - parser.add_argument('--output_cum_log_probs_npy', - type=str, - help='Numpy file where the cum_log_probs are stored', - default=None) - parser.add_argument( - '--run_profiling', - default=False, - action='store_true', - help="Run several 10 iterations to profile the inference latencies.") - parser = add_common_args(parser) - - return parser.parse_args(args=args) - - -def parse_input(tokenizer, - input_text=None, - prompt_template=None, - input_file=None, - add_special_tokens=True, - max_input_length=923, - pad_id=None, - num_prepend_vtokens=[], - model_name=None, - model_version=None): - if pad_id is None: - pad_id = tokenizer.pad_token_id - - batch_input_ids = [] - if input_file is None: - for curr_text in input_text: - if prompt_template is not None: - curr_text = prompt_template.format(input_text=curr_text) - input_ids = tokenizer.encode(curr_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_ids = np.array(line, dtype='int32') - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - input_ids = row[row != pad_id] - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.txt'): - with open(input_file, 'r', encoding='utf-8', - errors='replace') as txt_file: - input_text = txt_file.readlines() - batch_input_ids = tokenizer( - input_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length)["input_ids"] - else: - print('Input file format not supported.') - raise SystemExit - - if num_prepend_vtokens: - assert len(num_prepend_vtokens) == len(batch_input_ids) - base_vocab_size = tokenizer.vocab_size - len( - tokenizer.special_tokens_map.get('additional_special_tokens', [])) - for i, length in enumerate(num_prepend_vtokens): - batch_input_ids[i] = list( - range(base_vocab_size, - base_vocab_size + length)) + batch_input_ids[i] - - if input_file is None and 'GLM' in model_name and model_version == 'glm': - for ids in batch_input_ids: - ids.append(tokenizer.sop_token_id) - - batch_input_ids = [ - torch.tensor(x, dtype=torch.int32) for x in batch_input_ids - ] - return batch_input_ids - - -def print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=None, - output_npy=None, - context_logits=None, - generation_logits=None, - cum_log_probs=None, - log_probs=None, - output_logits_npy=None, - output_cum_log_probs_npy=None, - output_log_probs_npy=None): - batch_size, num_beams, _ = output_ids.size() - if output_csv is None and output_npy is None: - for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) - input_text = tokenizer.decode(inputs) - print(f'Input [Text {batch_idx}]: \"{input_text}\"') - for beam in range(num_beams): - output_begin = input_lengths[batch_idx] - output_end = sequence_lengths[batch_idx][beam] - outputs = output_ids[batch_idx][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print( - f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, 
output_ids.size(2))) - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - - # Save context logits - if context_logits is not None and output_logits_npy is not None: - context_logits = torch.cat(context_logits, axis=0) - vocab_size_padded = context_logits.shape[-1] - context_logits = context_logits.reshape([1, -1, vocab_size_padded]) - - output_context_logits_npy = output_logits_npy.split( - '.npy')[0] + "_context" - output_context_logits_file = Path(output_context_logits_npy) - context_outputs = np.array( - context_logits.squeeze(0).cpu().contiguous(), - dtype='float32') # [promptLengthSum, vocabSize] - np.save(output_context_logits_file, context_outputs) - - # Save generation logits - if generation_logits is not None and output_logits_npy is not None and num_beams == 1: - output_generation_logits_npy = output_logits_npy.split( - '.npy')[0] + "_generation" - output_generation_logits_file = Path(output_generation_logits_npy) - generation_outputs = np.array(generation_logits.cpu().contiguous(), - dtype='float32') - np.save(output_generation_logits_file, generation_outputs) - - # Save cum log probs - if cum_log_probs is not None and output_cum_log_probs_npy is not None: - cum_log_probs_file = Path(output_cum_log_probs_npy) - cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), - dtype='float32') - np.save(cum_log_probs_file, cum_log_probs_outputs) - - # Save cum log probs - if log_probs is not None and output_log_probs_npy is not None: - log_probs_file = Path(output_log_probs_npy) - log_probs_outputs = np.array(log_probs.cpu().contiguous(), - dtype='float32') - np.save(log_probs_file, log_probs_outputs) - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - # different handling if encoder-decoder models - is_enc_dec = { - name - for name in os.listdir(args.engine_dir) - if os.path.isdir(os.path.join(args.engine_dir, name)) - } == {'encoder', 'decoder'} - if is_enc_dec: - logger.warning( - "This path is an encoder-decoder model. Using different handling.") - assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." - - model_name, model_version = read_model_name( - args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: - logger.warning( - "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
- ) - args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] - - tokenizer, pad_id, end_id = load_tokenizer( - tokenizer_dir=args.tokenizer_dir, - vocab_file=args.vocab_file, - model_name=model_name, - model_version=model_version, - tokenizer_type=args.tokenizer_type, - ) - - if args.end_id: - end_id = args.end_id - - stop_words_list = None - if args.stop_words: - stop_words_list = tensorrt_llm.runtime.decode_words_list( - args.stop_words, tokenizer) - - bad_words_list = None - if args.bad_words: - bad_words_list = tensorrt_llm.runtime.decode_words_list( - args.bad_words, tokenizer) - - prompt_template = None - if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: - prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] - batch_input_ids = parse_input(tokenizer=tokenizer, - input_text=args.input_text, - prompt_template=prompt_template, - input_file=args.input_file, - add_special_tokens=args.add_special_tokens, - max_input_length=args.max_input_length, - pad_id=pad_id, - num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name, - model_version=model_version) - - if is_enc_dec: - encoder_input_ids = batch_input_ids - decoder_start_token_id = read_decoder_start_token_id( - os.path.join(args.engine_dir, "decoder")) - decoder_input_ids = [ - torch.tensor([decoder_start_token_id], dtype=torch.int32) - for _ in batch_input_ids - ] - - input_lengths = [x.size(0) for x in decoder_input_ids - ] if is_enc_dec else [x.size(0) for x in batch_input_ids] - encoder_input_lengths = [x.size(0) - for x in encoder_input_ids] if is_enc_dec else None - - if not args.use_py_session and not supports_inflight_batching( - os.path.join(args.engine_dir, "decoder") if is_enc_dec else args. - engine_dir): - logger.warning( - "The given engine does not support in-flight batching, fallback to python session" - ) - args.use_py_session = True - - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." - ) - args.use_py_session = True - if args.debug_mode and not args.use_py_session: - logger.warning( - "Debug mode is not supported in C++ session for now, fallback to Python session." - ) - args.use_py_session = True - if args.return_all_generated_tokens and args.use_py_session: - raise ValueError( - "Returning all the generated tokens at each step is not supported in the Python session, use C++ session instead." - ) - if (not args.return_all_generated_tokens) and args.streaming and ( - args.num_beams > 1): - logger.warning( - "Setting return_all_generated_tokens to True since streaming AND beam search are done simultaneously. " - "Returning the full beams at each streaming step is needed because beam search + streaming can change previous outputs. " - "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." 
- ) - args.return_all_generated_tokens = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_output_len=args.max_output_len, - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. - kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - multi_block_mode=args.multi_block_mode) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices, - return_all_generated_tokens=args.return_all_generated_tokens) - torch.cuda.synchronize() - - if args.streaming: - for curr_outputs in throttle_generator(outputs, - args.streaming_interval): - if runtime_rank == 0: - output_ids = curr_outputs['output_ids'] - sequence_lengths = curr_outputs['sequence_lengths'] - cum_log_probs = None - log_probs = None - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output( - tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - else: - if runtime_rank == 0: - output_ids = outputs['output_ids'] - sequence_lengths = outputs['sequence_lengths'] - context_logits = None - 
generation_logits = None - cum_log_probs = None - log_probs = None - if runner.gather_context_logits: - context_logits = outputs['context_logits'] - if runner.gather_generation_logits: - generation_logits = outputs['generation_logits'] - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - context_logits=context_logits, - generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - - if args.run_profiling: - ite = 1 - # warmup - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True, - return_all_generated_tokens=args.return_all_generated_tokens - ) - torch.cuda.synchronize() - - tensorrt_llm.profiler.start("tmp") - ite=1 - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True, - return_all_generated_tokens=args.return_all_generated_tokens - ) - torch.cuda.synchronize() - tensorrt_llm.profiler.stop("tmp") - - print( - f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" - ) - - -if __name__ == '__main__': - args = parse_arguments() - main(args) diff --git a/TensorRT-LLM/H100/run_power.py b/TensorRT-LLM/H100/run_power.py deleted file mode 100644 index 9b3de62..0000000 --- a/TensorRT-LLM/H100/run_power.py +++ /dev/null @@ -1,619 +0,0 @@ - -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import ast -import csv -import os -from pathlib import Path - -# from huggingface_hub import login -# login("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") - -from power_utils import gpuPowerProbe -power_profile = gpuPowerProbe(interval=0.10) - - -import numpy as np -import torch -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - add_common_args, load_tokenizer, read_model_name, - throttle_generator) - -import tensorrt_llm -import tensorrt_llm.profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner - -if PYTHON_BINDINGS: - from tensorrt_llm.runtime import ModelRunnerCpp - - -def parse_arguments(args=None): - parser = argparse.ArgumentParser() - - parser.add_argument('--pp_size', type=int, default = 1) - parser.add_argument('--tp_size', type=int, default = 1) - parser.add_argument('--moe_ep_size', type=int, default = 1) - parser.add_argument('--moe_tp_size', type=int, default = 1) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') - parser.add_argument('--precision', type=str, default="float16", help="precision") - parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") - - parser.add_argument('--max_input_length', type=int, default=923) - parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--input_text', - type=str, - nargs='+', - default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--input_file', - type=str, - help= - 'CSV or Numpy file containing tokenized input. Alternative to text input.', - default=None) - parser.add_argument('--output_csv', - type=str, - help='CSV file where the tokenized output is stored.', - default=None) - parser.add_argument('--output_npy', - type=str, - help='Numpy file where the tokenized output is stored.', - default=None) - parser.add_argument( - '--output_logits_npy', - type=str, - help= - 'Numpy file where the generation logits are stored. 
Use only when num_beams==1', - default=None) - parser.add_argument('--output_log_probs_npy', - type=str, - help='Numpy file where the log_probs are stored', - default=None) - parser.add_argument('--output_cum_log_probs_npy', - type=str, - help='Numpy file where the cum_log_probs are stored', - default=None) - parser.add_argument( - '--run_profiling', - default=False, - action='store_true', - help="Run several 10 iterations to profile the inference latencies.") - parser = add_common_args(parser) - - return parser.parse_args(args=args) - - -def parse_input(tokenizer, - input_text=None, - prompt_template=None, - input_file=None, - add_special_tokens=True, - max_input_length=923, - pad_id=None, - num_prepend_vtokens=[], - model_name=None, - model_version=None): - if pad_id is None: - pad_id = tokenizer.pad_token_id - - batch_input_ids = [] - if input_file is None: - for curr_text in input_text: - if prompt_template is not None: - curr_text = prompt_template.format(input_text=curr_text) - input_ids = tokenizer.encode(curr_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_ids = np.array(line, dtype='int32') - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - input_ids = row[row != pad_id] - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.txt'): - with open(input_file, 'r', encoding='utf-8', - errors='replace') as txt_file: - input_text = txt_file.readlines() - batch_input_ids = tokenizer( - input_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length)["input_ids"] - else: - print('Input file format not supported.') - raise SystemExit - - if num_prepend_vtokens: - assert len(num_prepend_vtokens) == len(batch_input_ids) - base_vocab_size = tokenizer.vocab_size - len( - tokenizer.special_tokens_map.get('additional_special_tokens', [])) - for i, length in enumerate(num_prepend_vtokens): - batch_input_ids[i] = list( - range(base_vocab_size, - base_vocab_size + length)) + batch_input_ids[i] - - if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': - for ids in batch_input_ids: - ids.append(tokenizer.sop_token_id) - - batch_input_ids = [ - torch.tensor(x, dtype=torch.int32) for x in batch_input_ids - ] - return batch_input_ids - - -def print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=None, - output_npy=None, - context_logits=None, - generation_logits=None, - cum_log_probs=None, - log_probs=None, - output_logits_npy=None, - output_cum_log_probs_npy=None, - output_log_probs_npy=None): - batch_size, num_beams, _ = output_ids.size() - if output_csv is None and output_npy is None: - for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) - input_text = tokenizer.decode(inputs) - print(f'Input [Text {batch_idx}]: \"{input_text}\"') - for beam in range(num_beams): - output_begin = input_lengths[batch_idx] - output_end = sequence_lengths[batch_idx][beam] - outputs = output_ids[batch_idx][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print( - f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, 
output_ids.size(2))) - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - - # Save context logits - if context_logits is not None and output_logits_npy is not None: - context_logits = torch.cat(context_logits, axis=0) - vocab_size_padded = context_logits.shape[-1] - context_logits = context_logits.reshape([1, -1, vocab_size_padded]) - - output_context_logits_npy = output_logits_npy.split( - '.npy')[0] + "_context" - output_context_logits_file = Path(output_context_logits_npy) - context_outputs = np.array( - context_logits.squeeze(0).cpu().contiguous(), - dtype='float32') # [promptLengthSum, vocabSize] - np.save(output_context_logits_file, context_outputs) - - # Save generation logits - if generation_logits is not None and output_logits_npy is not None and num_beams == 1: - output_generation_logits_npy = output_logits_npy.split( - '.npy')[0] + "_generation" - output_generation_logits_file = Path(output_generation_logits_npy) - generation_outputs = np.array(generation_logits.cpu().contiguous(), - dtype='float32') - np.save(output_generation_logits_file, generation_outputs) - - # Save cum log probs - if cum_log_probs is not None and output_cum_log_probs_npy is not None: - cum_log_probs_file = Path(output_cum_log_probs_npy) - cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), - dtype='float32') - np.save(cum_log_probs_file, cum_log_probs_outputs) - - # Save cum log probs - if log_probs is not None and output_log_probs_npy is not None: - log_probs_file = Path(output_log_probs_npy) - log_probs_outputs = np.array(log_probs.cpu().contiguous(), - dtype='float32') - np.save(log_probs_file, log_probs_outputs) - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - # different handling if encoder-decoder models - import os - is_enc_dec = { - name - for name in os.listdir(args.engine_dir) - if os.path.isdir(os.path.join(args.engine_dir, name)) - } == {'encoder', 'decoder'} - if is_enc_dec: - logger.warning( - "This path is an encoder-decoder model. Using different handling.") - assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." - - model_name, model_version = read_model_name( - args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None: - logger.warning( - "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
- ) - args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] - - tokenizer, pad_id, end_id = load_tokenizer(args, - tokenizer_dir=args.tokenizer_dir, - vocab_file=args.vocab_file, - model_name=model_name, - model_version=model_version, - tokenizer_type=args.tokenizer_type, - ) - - stop_words_list = None - if args.stop_words: - stop_words_list = tensorrt_llm.runtime.decode_words_list( - args.stop_words, tokenizer) - - bad_words_list = None - if args.bad_words: - bad_words_list = tensorrt_llm.runtime.decode_words_list( - args.bad_words, tokenizer) - - prompt_template = None - if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: - prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] - batch_input_ids = parse_input(tokenizer=tokenizer, - input_text=args.input_text, - prompt_template=prompt_template, - input_file=args.input_file, - add_special_tokens=args.add_special_tokens, - max_input_length=args.max_input_length, - pad_id=pad_id, - num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name, - model_version=model_version) - - if is_enc_dec: - encoder_input_ids = batch_input_ids - decoder_input_ids = [ - torch.tensor([pad_id], dtype=torch.int32) for _ in batch_input_ids - ] # by default decoder_start_token_id for T5 - - input_lengths = [x.size(0) for x in decoder_input_ids - ] if is_enc_dec else [x.size(0) for x in batch_input_ids] - encoder_input_lengths = [x.size(0) - for x in encoder_input_ids] if is_enc_dec else None - - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." - ) - args.use_py_session = True - if args.debug_mode and not args.use_py_session: - logger.warning( - "Debug mode is not supported in C++ session for now, fallback to Python session." - ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_output_len=args.max_output_len, - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. 
- kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - ) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices) - torch.cuda.synchronize() - - if args.streaming: - for curr_outputs in throttle_generator(outputs, - args.streaming_interval): - if runtime_rank == 0: - output_ids = curr_outputs['output_ids'] - sequence_lengths = curr_outputs['sequence_lengths'] - cum_log_probs = None - log_probs = None - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output( - tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - else: - if runtime_rank == 0: - output_ids = outputs['output_ids'] - sequence_lengths = outputs['sequence_lengths'] - context_logits = None - generation_logits = None - cum_log_probs = None - log_probs = None - if runner.gather_context_logits: - context_logits = outputs['context_logits'] - if runner.gather_generation_logits: - generation_logits = outputs['generation_logits'] - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - context_logits=context_logits, - generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - - if args.run_profiling: - ite = 1 - # warmup - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - 
presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - torch.cuda.synchronize() - - # tensorrt_llm.profiler.start("tmp") - for _ in range(ite): - with torch.no_grad(): - power_profile.start() - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - training_powers, training_powers_time = power_profile.stop() - power_profile.destroy() - torch.cuda.synchronize() - - list_1 = ["Hardware", - "Num of Hardware", - "Framework", - "Model", - "Input Output Length", - "Batch Size", - "training_powers", - "training_powers_time" - ] - - list_2 = ["Nvidia A100 GPU", - args.tp_size, - "TensorRT-LLM", - args.model_name, - args.max_input_length, - args.batch_size, - list(training_powers), - list(training_powers_time) - ] - - assert len(list_1) == len(list_2) - - import csv, os - - def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - return model_name - # csv_file = "power_results.csv" - csv_file = "power_" + str(split_string(args.model_name)) + ".csv" - file_exists = os.path.exists(csv_file) - - if runtime_rank == 0: - with open(csv_file, 'a', newline = '') as csvfile: - writer = csv.writer(csvfile) - - if not file_exists: - writer.writerow(list_1) - - writer.writerow(list_2) - - csvfile.close() - - # tensorrt_llm.profiler.stop("tmp") - - # print( - # f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" - # ) - - # file_path = "/home/krishnat95/llama-bench/Inference/TensorRT-LLM/examples/batch_input.csv" - - # if runtime_rank == 0: - # with open(file_path, 'a', newline='') as file: - # writer = csv.writer(file) - # latency = tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite - # throughput = (args.batch_size*(args.max_input_length + args.max_output_len))/latency - # data = [[args.model_name, str(args.tp_size), str(args.pp_size), str(args.moe_ep_size), str(args.moe_tp_size), str(args.precision), str(args.max_input_length), str(args.max_output_len), str(args.batch_size), str(args.int8_kv_cache), str(latency), str(throughput)]] - # writer.writerows(data) - - - - -import random -import string - - -def generate_random_word(length): - letters = string.ascii_letters - return ''.join(random.choice(letters) for i in 
range(length)) - -def generate_input(args): - random_words = ["France" for _ in range(args.max_input_length)] - - input_id = "" - - for word in random_words: - input_id = input_id + word + " " - - input_id = input_id[:-1] - - input_list = [] - - for batch_size in range(args.batch_size): - input_list.append(input_id) - - return input_list - - -if __name__ == '__main__': - args = parse_arguments() - args.input_text = generate_input(args) - main(args) - - - - - -power_profile.start() - -training_powers, training_powers_time = power_profile.stop() -power_profile.destroy() - diff --git a/TensorRT-LLM/H100/run_precision_bench.py b/TensorRT-LLM/H100/run_precision_bench.py deleted file mode 100644 index c8bc0f4..0000000 --- a/TensorRT-LLM/H100/run_precision_bench.py +++ /dev/null @@ -1,563 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import ast -import csv -import os -from pathlib import Path - -import numpy as np -import torch -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - add_common_args, load_tokenizer, read_model_name, - throttle_generator) - -import tensorrt_llm -import tensorrt_llm.profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner - -if PYTHON_BINDINGS: - from tensorrt_llm.runtime import ModelRunnerCpp - - -def parse_arguments(args=None): - parser = argparse.ArgumentParser() - - parser.add_argument('--pp_size', type=int, default = 1) - parser.add_argument('--tp_size', type=int, default = 1) - parser.add_argument('--moe_ep_size', type=int, default = 1) - parser.add_argument('--moe_tp_size', type=int, default = 1) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') - parser.add_argument('--precision', type=str, default="float16", help="precision") - parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") - - parser.add_argument('--qformat', type=str, default="float16", help="precision") - parser.add_argument('--kv_cache_dtype', type=str, default="float16", help="precision") - - parser.add_argument('--max_input_length', type=int, default=923) - parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--input_text', - type=str, - nargs='+', - default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--input_file', - type=str, - help= - 'CSV or Numpy file containing tokenized input. 
Alternative to text input.', - default=None) - parser.add_argument('--output_csv', - type=str, - help='CSV file where the tokenized output is stored.', - default=None) - parser.add_argument('--output_npy', - type=str, - help='Numpy file where the tokenized output is stored.', - default=None) - parser.add_argument( - '--output_logits_npy', - type=str, - help= - 'Numpy file where the generation logits are stored. Use only when num_beams==1', - default=None) - parser.add_argument('--output_log_probs_npy', - type=str, - help='Numpy file where the log_probs are stored', - default=None) - parser.add_argument('--output_cum_log_probs_npy', - type=str, - help='Numpy file where the cum_log_probs are stored', - default=None) - parser.add_argument( - '--run_profiling', - default=False, - action='store_true', - help="Run several 10 iterations to profile the inference latencies.") - parser = add_common_args(parser) - - return parser.parse_args(args=args) - - -def parse_input(tokenizer, - input_text=None, - prompt_template=None, - input_file=None, - add_special_tokens=True, - max_input_length=923, - pad_id=None, - num_prepend_vtokens=[], - model_name=None, - model_version=None): - if pad_id is None: - pad_id = tokenizer.pad_token_id - - batch_input_ids = [] - if input_file is None: - for curr_text in input_text: - if prompt_template is not None: - curr_text = prompt_template.format(input_text=curr_text) - input_ids = tokenizer.encode(curr_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_ids = np.array(line, dtype='int32') - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - input_ids = row[row != pad_id] - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.txt'): - with open(input_file, 'r', encoding='utf-8', - errors='replace') as txt_file: - input_text = txt_file.readlines() - batch_input_ids = tokenizer( - input_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length)["input_ids"] - else: - print('Input file format not supported.') - raise SystemExit - - if num_prepend_vtokens: - assert len(num_prepend_vtokens) == len(batch_input_ids) - base_vocab_size = tokenizer.vocab_size - len( - tokenizer.special_tokens_map.get('additional_special_tokens', [])) - for i, length in enumerate(num_prepend_vtokens): - batch_input_ids[i] = list( - range(base_vocab_size, - base_vocab_size + length)) + batch_input_ids[i] - - if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': - for ids in batch_input_ids: - ids.append(tokenizer.sop_token_id) - - batch_input_ids = [ - torch.tensor(x, dtype=torch.int32) for x in batch_input_ids - ] - return batch_input_ids - - -def print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=None, - output_npy=None, - context_logits=None, - generation_logits=None, - cum_log_probs=None, - log_probs=None, - output_logits_npy=None, - output_cum_log_probs_npy=None, - output_log_probs_npy=None): - batch_size, num_beams, _ = output_ids.size() - if output_csv is None and output_npy is None: - for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) - input_text = tokenizer.decode(inputs) - 
print(f'Input [Text {batch_idx}]: \"{input_text}\"') - for beam in range(num_beams): - output_begin = input_lengths[batch_idx] - output_end = sequence_lengths[batch_idx][beam] - outputs = output_ids[batch_idx][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print( - f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, output_ids.size(2))) - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - - # Save context logits - if context_logits is not None and output_logits_npy is not None: - context_logits = torch.cat(context_logits, axis=0) - vocab_size_padded = context_logits.shape[-1] - context_logits = context_logits.reshape([1, -1, vocab_size_padded]) - - output_context_logits_npy = output_logits_npy.split( - '.npy')[0] + "_context" - output_context_logits_file = Path(output_context_logits_npy) - context_outputs = np.array( - context_logits.squeeze(0).cpu().contiguous(), - dtype='float32') # [promptLengthSum, vocabSize] - np.save(output_context_logits_file, context_outputs) - - # Save generation logits - if generation_logits is not None and output_logits_npy is not None and num_beams == 1: - output_generation_logits_npy = output_logits_npy.split( - '.npy')[0] + "_generation" - output_generation_logits_file = Path(output_generation_logits_npy) - generation_outputs = np.array(generation_logits.cpu().contiguous(), - dtype='float32') - np.save(output_generation_logits_file, generation_outputs) - - # Save cum log probs - if cum_log_probs is not None and output_cum_log_probs_npy is not None: - cum_log_probs_file = Path(output_cum_log_probs_npy) - cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), - dtype='float32') - np.save(cum_log_probs_file, cum_log_probs_outputs) - - # Save cum log probs - if log_probs is not None and output_log_probs_npy is not None: - log_probs_file = Path(output_log_probs_npy) - log_probs_outputs = np.array(log_probs.cpu().contiguous(), - dtype='float32') - np.save(log_probs_file, log_probs_outputs) - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - # different handling if encoder-decoder models - is_enc_dec = { - name - for name in os.listdir(args.engine_dir) - if os.path.isdir(os.path.join(args.engine_dir, name)) - } == {'encoder', 'decoder'} - if is_enc_dec: - logger.warning( - "This path is an encoder-decoder model. Using different handling.") - assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." - - model_name, model_version = read_model_name( - args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None: - logger.warning( - "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
- ) - args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] - - tokenizer, pad_id, end_id = load_tokenizer(args, - tokenizer_dir=args.tokenizer_dir, - vocab_file=args.vocab_file, - model_name=model_name, - model_version=model_version, - tokenizer_type=args.tokenizer_type, - ) - - stop_words_list = None - if args.stop_words: - stop_words_list = tensorrt_llm.runtime.decode_words_list( - args.stop_words, tokenizer) - - bad_words_list = None - if args.bad_words: - bad_words_list = tensorrt_llm.runtime.decode_words_list( - args.bad_words, tokenizer) - - prompt_template = None - if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: - prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] - batch_input_ids = parse_input(tokenizer=tokenizer, - input_text=args.input_text, - prompt_template=prompt_template, - input_file=args.input_file, - add_special_tokens=args.add_special_tokens, - max_input_length=args.max_input_length, - pad_id=pad_id, - num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name, - model_version=model_version) - - if is_enc_dec: - encoder_input_ids = batch_input_ids - decoder_input_ids = [ - torch.tensor([pad_id], dtype=torch.int32) for _ in batch_input_ids - ] # by default decoder_start_token_id for T5 - - input_lengths = [x.size(0) for x in decoder_input_ids - ] if is_enc_dec else [x.size(0) for x in batch_input_ids] - encoder_input_lengths = [x.size(0) - for x in encoder_input_ids] if is_enc_dec else None - - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." - ) - args.use_py_session = True - if args.debug_mode and not args.use_py_session: - logger.warning( - "Debug mode is not supported in C++ session for now, fallback to Python session." - ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_output_len=args.max_output_len, - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. 
- kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - ) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices) - torch.cuda.synchronize() - - if args.streaming: - for curr_outputs in throttle_generator(outputs, - args.streaming_interval): - if runtime_rank == 0: - output_ids = curr_outputs['output_ids'] - sequence_lengths = curr_outputs['sequence_lengths'] - cum_log_probs = None - log_probs = None - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output( - tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - else: - if runtime_rank == 0: - output_ids = outputs['output_ids'] - sequence_lengths = outputs['sequence_lengths'] - context_logits = None - generation_logits = None - cum_log_probs = None - log_probs = None - if runner.gather_context_logits: - context_logits = outputs['context_logits'] - if runner.gather_generation_logits: - generation_logits = outputs['generation_logits'] - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - context_logits=context_logits, - generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - - if args.run_profiling: - ite = 10 - # warmup - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - 
presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - torch.cuda.synchronize() - - tensorrt_llm.profiler.start("tmp") - ite = 1 - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - torch.cuda.synchronize() - tensorrt_llm.profiler.stop("tmp") - - print( - f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" - ) - - def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - return model_name - - file_path = "precision_results_" + str(split_string(args.model_name)) + ".csv" - - if runtime_rank == 0: - - latency = tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite - throughput = (args.batch_size*(args.max_input_length + args.max_output_len))/latency - Weight_KV_dtype = f"Weight = {args.qformat}, KV Cache = {args.kv_cache_dtype}" - data = [["Nvidia A100 GPU",str(args.tp_size),"TensorRT-LLM",args.model_name,str(args.max_input_length),str(args.batch_size),Weight_KV_dtype,str(latency),str(throughput)]] - - with open(file_path, 'a', newline='') as file: - writer = csv.writer(file) - writer.writerows(data) - - -import random -import string - - -def generate_random_word(length): - letters = string.ascii_letters - return ''.join(random.choice(letters) for i in range(length)) - -def generate_input(args): - random_words = ["France" for _ in range(args.max_input_length)] - - input_id = "" - - for word in random_words: - input_id = input_id + word + " " - - input_id = input_id[:-1] - - input_list = [] - - for batch_size in range(args.batch_size): - input_list.append(input_id) - - return input_list - - -if __name__ == '__main__': - args = parse_arguments() - args.input_text = generate_input(args) - main(args) diff --git a/TensorRT-LLM/H100/utils.py b/TensorRT-LLM/H100/utils.py deleted file mode 100644 index bf0057f..0000000 --- a/TensorRT-LLM/H100/utils.py +++ /dev/null @@ -1,373 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json -from pathlib import Path -from typing import Optional - -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - -from tensorrt_llm.bindings import GptJsonConfig -from tensorrt_llm.builder import get_engine_version - -DEFAULT_HF_MODEL_DIRS = { - 'BaichuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', - 'BloomForCausalLM': 'bigscience/bloom-560m', - 'GLMModel': 'THUDM/glm-10b', - 'ChatGLMModel': 'THUDM/chatglm3-6b', - 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', - 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', - 'GPTForCausalLM': 'gpt2-medium', - 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', - 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', - 'InternLMForCausalLM': 'internlm/internlm-chat-7b', - 'InternLM2ForCausalLM': 'internlm/internlm2-chat-7b', - 'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf', - 'MPTForCausalLM': 'mosaicml/mpt-7b', - 'PhiForCausalLM': 'microsoft/phi-2', - 'OPTForCausalLM': 'facebook/opt-350m', - 'QWenLMHeadModel': 'Qwen/Qwen-7B', - 'QWenForCausalLM': 'Qwen/Qwen-7B', - 'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B', - 'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B', - 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', -} - -INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). -- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. -- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
-""" - -QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n" - -DEFAULT_PROMPT_TEMPLATES = { - 'InternLMForCausalLM': "<|User|>:{input_text}\n<|Bot|>:", - 'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION + - "<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", - 'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE, - 'QWenForCausalLM': QWEN_PROMPT_TEMPLATE, - 'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE, - 'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE, -} - - -def supports_inflight_batching(engine_dir): - config_path = Path(engine_dir) / "config.json" - json_config = GptJsonConfig.parse_file(config_path) - model_config = json_config.model_config - return model_config.supports_inflight_batching - - -def read_decoder_start_token_id(engine_dir): - with open(Path(engine_dir) / "config.json", 'r') as f: - config = json.load(f) - return config['pretrained_config']['decoder_start_token_id'] - - -def read_model_name(engine_dir: str): - engine_version = get_engine_version(engine_dir) - - with open(Path(engine_dir) / "config.json", 'r') as f: - config = json.load(f) - - if engine_version is None: - return config['builder_config']['name'], None - - model_arch = config['pretrained_config']['architecture'] - model_version = None - if 'GLM' in model_arch: - model_version = config['pretrained_config']['chatglm_version'] - if 'qwen' in model_arch.lower(): - model_version = config['pretrained_config']['qwen_type'] - return model_arch, model_version - - -def throttle_generator(generator, stream_interval): - for i, out in enumerate(generator): - if not i % stream_interval: - yield out - - if i % stream_interval: - yield out - - -def load_tokenizer(args, tokenizer_dir: Optional[str] = None, - vocab_file: Optional[str] = None, - model_name: str = 'GPTForCausalLM', - model_version: Optional[str] = None, - tokenizer_type: Optional[str] = None): - if vocab_file is None: - use_fast = True - if tokenizer_type is not None and tokenizer_type == "llama": - use_fast = False - # Should set both padding_side and truncation_side to be 'left' - if "Llama-3-8B" in args.model_name: - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", - cache_dir = "/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6", - legacy=False, - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False - ) - - else: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - truncation_side='left', - trust_remote_code=True, - tokenizer_type=tokenizer_type, - use_fast=use_fast) - elif model_name == 'GemmaForCausalLM' or model_name == 'RecurrentGemmaForCausalLM': - from transformers import GemmaTokenizer - - # Initialize tokenizer from vocab file. 
- tokenizer = GemmaTokenizer(vocab_file=vocab_file, - padding_side='left', - truncation_side='left', - legacy=False) - elif model_name == 'Grok1ModelForCausalLM': - tokenizer = LlamaTokenizer(vocab_file=vocab_file, - padding_side='left', - truncation_side='left', - legacy=False, - use_fast=False) - else: - # For gpt-next, directly load from tokenizer.model - tokenizer = T5Tokenizer(vocab_file=vocab_file, - padding_side='left', - truncation_side='left', - legacy=False) - if 'qwen' in model_name.lower() and model_version == 'qwen': - with open(Path(tokenizer_dir) / "generation_config.json") as f: - gen_config = json.load(f) - pad_id = gen_config['pad_token_id'] - end_id = gen_config['eos_token_id'] - elif 'GLM' in model_name and model_version == 'glm': - pad_id = tokenizer.pad_token_id - end_id = tokenizer.eop_token_id - else: - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id - pad_id = tokenizer.pad_token_id - end_id = tokenizer.eos_token_id - - return tokenizer, pad_id, end_id - - -def add_common_args(parser): - # sampling arguments - parser.add_argument('--num_beams', - type=int, - help="Use beam search if num_beams > 1", - default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--beam_search_diversity_rate', type=float, default=0.0) - parser.add_argument('--random_seed', type=int, default=0) - parser.add_argument('--early_stopping', - type=int, - help='Use early stopping if num_beams > 1' - '1 for early-stopping, 0 for non-early-stopping' - 'other values for stopping by length', - default=1) - parser.add_argument( - '--end_id', - default=None, - type=int, - help="Override tokenizer end_id to stop on given end_id token.") - parser.add_argument( - '--stop_words', - default=None, - type=str, - nargs="+", - action='append', - help= - 'Set stop words for a batch. Successive invocations of --stop_words set stop words for other batches.' - ' E.g.: --stop_words " London" " chef" --stop_words "eventually became" "was not"', - ) - parser.add_argument( - '--bad_words', - default=None, - type=str, - nargs="+", - action='append', - help= - 'Set bad words for a batch. Successive invocations of --bad_words set bad words for other batches.' - ' E.g.: --bad_words " London" " chef" --bad_words "eventually became" "was not"', - ) - parser.add_argument('--no_repeat_ngram_size', type=int, default=None) - - # common runtime arguments - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - parser.add_argument( - '--max_attention_window_size', - type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behavior' - ) - parser.add_argument( - '--multi_block_mode', - action='store_true', - help= - "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel." 
- ) - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument( - '--no_prompt_template', - dest='use_prompt_template', - default=True, - action='store_false', - help= - "Whether or not to use default prompt template to wrap the input text.") - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--streaming', default=False, action='store_true') - parser.add_argument('--streaming_interval', - type=int, - help="How often to return tokens when streaming.", - default=5) - parser.add_argument( - '--prompt_table_path', - type=str, - help="Path to .npy file, exported by nemo_prompt_convert.py") - parser.add_argument( - '--prompt_tasks', - help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") - parser.add_argument('--lora_dir', - type=str, - default=None, - nargs="+", - help="The directory of LoRA weights") - parser.add_argument('--lora_ckpt_source', - type=str, - default="hf", - choices=["hf", "nemo"], - help="The source of lora checkpoint.") - parser.add_argument( - '--lora_task_uids', - type=str, - default=None, - nargs="+", - help="The list of LoRA task uids; use -1 to disable the LoRA module") - parser.add_argument( - '--num_prepend_vtokens', - nargs="+", - type=int, - help="Number of (default) virtual tokens to prepend to each sentence." - " For example, '--num_prepend_vtokens=10' will prepend the tokens" - " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") - parser.add_argument( - '--medusa_choices', - type=str, - default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." - ) - - # model arguments - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. 
' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file', - help="Used for sentencepiece tokenizers") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) - parser.add_argument( - '--tokenizer_dir', - default=None, - help='tokenizer path; defaults to hf_model_dir if left unspecified') - - # memory argument - parser.add_argument( - '--gpu_weights_percent', - default=1, - type=float, - help= - 'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.', - ) - parser.add_argument( - '--max_tokens_in_paged_kv_cache', - default=None, - type=int, - help= - 'Specify the maximum number of tokens in a kv cache page (only available with cpp session).', - ) - parser.add_argument( - '--kv_cache_enable_block_reuse', - action='store_true', - help= - 'Enables block reuse in kv cache (only available with cpp session).', - ) - parser.add_argument( - '--kv_cache_free_gpu_memory_fraction', - default=0.9, - type=float, - help='Specify the free gpu memory fraction.', - ) - parser.add_argument( - '--enable_chunked_context', - action='store_true', - help='Enables chunked context (only available with cpp session).', - ) - - # hf model argument (if use hf model) - parser.add_argument( - '--hf_data_type', - '--data_type', - type=str, - choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], - default='fp16', - help="The data type for hf model.") - parser.add_argument( - '--hf_device_map_auto', - action='store_true', - help="Use device map 'auto' to load a pretrained HF model. This may " - "help to test a large model that cannot fit into a singlue GPU.") - - parser.add_argument( - "--return_all_generated_tokens", - default=False, - action="store_true", - help="This option changes the token output only for streaming. " - "If not specified, return only generated tokens at each step. " - "If specified, return the full beams/outputs at each step. " - "It is automatically enabled for num_beams>1 (only available with cpp session). " - "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." 
- ) - - return parser diff --git a/TensorRT-LLM/README.md b/TensorRT-LLM/README.md deleted file mode 100644 index 7ed0f40..0000000 --- a/TensorRT-LLM/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# TensorRT-LLM -TensorRT-LLM diff --git a/InferenceGraphPlotter/data/.DS_Store b/data/.DS_Store similarity index 100% rename from InferenceGraphPlotter/data/.DS_Store rename to data/.DS_Store diff --git a/InferenceGraphPlotter/data/Best_Hardware_and_Framework/All_results.csv b/data/Best_Hardware_and_Framework/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Best_Hardware_and_Framework/All_results.csv rename to data/Best_Hardware_and_Framework/All_results.csv diff --git a/InferenceGraphPlotter/data/Best_Hardware_and_Framework/config.json b/data/Best_Hardware_and_Framework/config.json similarity index 100% rename from InferenceGraphPlotter/data/Best_Hardware_and_Framework/config.json rename to data/Best_Hardware_and_Framework/config.json diff --git a/InferenceGraphPlotter/data/KV_Cache/All_results.csv b/data/KV_Cache/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache/All_results.csv rename to data/KV_Cache/All_results.csv diff --git a/InferenceGraphPlotter/data/KV_Cache/config.json b/data/KV_Cache/config.json similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache/config.json rename to data/KV_Cache/config.json diff --git a/InferenceGraphPlotter/data/KV_Cache_Block_Size/All_results.csv b/data/KV_Cache_Block_Size/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Block_Size/All_results.csv rename to data/KV_Cache_Block_Size/All_results.csv diff --git a/InferenceGraphPlotter/data/KV_Cache_Block_Size/config.json b/data/KV_Cache_Block_Size/config.json similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Block_Size/config.json rename to data/KV_Cache_Block_Size/config.json diff --git a/InferenceGraphPlotter/data/KV_Cache_Dtype/All_results.csv b/data/KV_Cache_Dtype/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Dtype/All_results.csv rename to data/KV_Cache_Dtype/All_results.csv diff --git a/InferenceGraphPlotter/data/KV_Cache_Dtype/config.json b/data/KV_Cache_Dtype/config.json similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Dtype/config.json rename to data/KV_Cache_Dtype/config.json diff --git a/InferenceGraphPlotter/data/Parallelism/All_results.csv b/data/Parallelism/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Parallelism/All_results.csv rename to data/Parallelism/All_results.csv diff --git a/InferenceGraphPlotter/data/Parallelism/config.json b/data/Parallelism/config.json similarity index 100% rename from InferenceGraphPlotter/data/Parallelism/config.json rename to data/Parallelism/config.json diff --git a/InferenceGraphPlotter/data/Perplexity/All_results.csv b/data/Perplexity/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Perplexity/All_results.csv rename to data/Perplexity/All_results.csv diff --git a/InferenceGraphPlotter/data/Perplexity/config.json b/data/Perplexity/config.json similarity index 100% rename from InferenceGraphPlotter/data/Perplexity/config.json rename to data/Perplexity/config.json diff --git a/InferenceGraphPlotter/data/Power/All_results.csv b/data/Power/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Power/All_results.csv rename to data/Power/All_results.csv diff --git 
a/InferenceGraphPlotter/data/Power/config.json b/data/Power/config.json similarity index 100% rename from InferenceGraphPlotter/data/Power/config.json rename to data/Power/config.json diff --git a/InferenceGraphPlotter/data/Speculative_Decoding/All_results.csv b/data/Speculative_Decoding/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Speculative_Decoding/All_results.csv rename to data/Speculative_Decoding/All_results.csv diff --git a/InferenceGraphPlotter/data/Speculative_Decoding/config.json b/data/Speculative_Decoding/config.json similarity index 100% rename from InferenceGraphPlotter/data/Speculative_Decoding/config.json rename to data/Speculative_Decoding/config.json diff --git a/InferenceGraphPlotter/data/Throughput/All_results.csv b/data/Throughput/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Throughput/All_results.csv rename to data/Throughput/All_results.csv diff --git a/InferenceGraphPlotter/data/Throughput/config.json b/data/Throughput/config.json similarity index 100% rename from InferenceGraphPlotter/data/Throughput/config.json rename to data/Throughput/config.json diff --git a/InferenceGraphPlotter/data/alias.json b/data/alias.json similarity index 100% rename from InferenceGraphPlotter/data/alias.json rename to data/alias.json diff --git a/InferenceGraphPlotter/data/color_coding.json b/data/color_coding.json similarity index 100% rename from InferenceGraphPlotter/data/color_coding.json rename to data/color_coding.json diff --git a/InferenceGraphPlotter/data/graphs_list.txt b/data/graphs_list.txt similarity index 100% rename from InferenceGraphPlotter/data/graphs_list.txt rename to data/graphs_list.txt diff --git a/InferenceGraphPlotter/index.html b/index.html similarity index 100% rename from InferenceGraphPlotter/index.html rename to index.html diff --git a/llama.cpp/A100/README.MD b/llama.cpp/A100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/GH200/README.MD b/llama.cpp/GH200/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/H100/README.MD b/llama.cpp/H100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/MI250/README.MD b/llama.cpp/MI250/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/README.md b/llama.cpp/README.md deleted file mode 100644 index 9487602..0000000 --- a/llama.cpp/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# llama.cpp -llama.cpp diff --git a/InferenceGraphPlotter/src/css/style.css b/src/css/style.css similarity index 100% rename from InferenceGraphPlotter/src/css/style.css rename to src/css/style.css diff --git a/InferenceGraphPlotter/src/js/display.js b/src/js/display.js similarity index 100% rename from InferenceGraphPlotter/src/js/display.js rename to src/js/display.js diff --git a/InferenceGraphPlotter/src/js/download_SVG.js b/src/js/download_SVG.js similarity index 100% rename from InferenceGraphPlotter/src/js/download_SVG.js rename to src/js/download_SVG.js diff --git a/vLLM/A100/README.MD b/vLLM/A100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/GH200/README.MD b/vLLM/GH200/README.MD deleted file mode 100644 index 3269baa..0000000 --- a/vLLM/GH200/README.MD +++ /dev/null @@ -1,15 +0,0 @@ -# Setup vLLM on GH200 - -1. Build a container -```bash -$ source build-container.sh -``` - -2. Run container. -```bash -sourece run-container.sh -``` -This will run the container and execute the `run-models.sh` script. - -3. 
Run models -First `run-models.sh` will install the dependacnies by calling `wheels/setup_wheel.sh` script. It will run benchmakrs following it. \ No newline at end of file diff --git a/vLLM/GH200/benchmark_latency.py b/vLLM/GH200/benchmark_latency.py deleted file mode 100644 index b5048f7..0000000 --- a/vLLM/GH200/benchmark_latency.py +++ /dev/null @@ -1,334 +0,0 @@ -"""Benchmark the latency of processing a single batch of requests.""" -import argparse -import json -import time -from pathlib import Path -from typing import List, Optional - -import numpy as np -import torch -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptInputs -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser - -import os -import csv - -def main(args: argparse.Namespace): - print(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) - - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_inputs: List[PromptStrictInputs] = [{ - "prompt_token_ids": batch - } for batch in dummy_prompt_token_ids.tolist()] - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - else: - start_time = time.perf_counter() - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." 
- ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f'Avg latency: {np.mean(latencies)} seconds') - for percentage, percentile in zip(percentages, percentiles): - print(f'{percentage}% percentile latency: {percentile} seconds') - # output to csv file - avg_latency = np.mean(latencies) - print(f'Avg latency: {avg_latency} seconds') - - total_num_tokens = args.batch_size*(args.input_len + args.output_len) - print("Total Number of Tokens = ", total_num_tokens) - - throughput = total_num_tokens/avg_latency - print("Throughput = ", throughput) - - list_1 = ["Model Name", - "throughput", - "latency", - "batch size", - "tensor_parallel", - "input length", - "output length" - ] - - list_2 = [args.model, - throughput, - avg_latency, - args.batch_size, - args.tensor_parallel_size, - args.input_len, - args.output_len - ] - - assert len(list_1) == len(list_2) - - def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - return model_name - - csv_file = "results_" + str(split_string(args.model)) + ".csv" - file_exists = os.path.exists(csv_file) - - with open(csv_file, 'a', newline = '') as csvfile: - writer = csv.writer(csvfile) - - if not file_exists: - writer.writerow(list_1) - - writer.writerow(list_2) - - csvfile.close() - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - - -if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=30, - help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). 
' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') - parser.add_argument( - '--profile-result-dir', - type=str, - default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', action='store_true') - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' 
- 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') - args = parser.parse_args() - main(args) diff --git a/vLLM/GH200/benchmark_throughput_power_gh200.py b/vLLM/GH200/benchmark_throughput_power_gh200.py deleted file mode 100644 index d3833fa..0000000 --- a/vLLM/GH200/benchmark_throughput_power_gh200.py +++ /dev/null @@ -1,331 +0,0 @@ -"""Benchmark the latency of processing a single batch of requests.""" -import argparse -import json -import time -from pathlib import Path -from typing import List, Optional -import numpy as np -import torch -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptInputs -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser -import csv -import os - -# from huggingface_hub import login -# login("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") - -perplexity_dict = {"huggyllama/llama-7b":3.1271538213640806, - "huggyllama/llama-13b":2.9614621865686885, - "mistralai/Mixtral-8x7B-v0.1":2.7458531353012336, - "meta-llama/Llama-2-13b-hf":2.811120439876313, - "huggyllama/llama-30b":2.744248104044345, - "facebook/opt-13b":3.870206998984964, - "Nexusflow/NexusRaven-V2-13B":3.3361824356186327, - "mistralai/Mixtral-8x22B-v0.1":2.5427975971657135, - "meta-llama/Llama-2-7b-hf":2.9624337637748193, - "mistralai/Mistral-7B-v0.3":3.0581070650881257, - "Qwen/Qwen1.5-7B":4.249519567986975, - "google/gemma-1.1-7b-it":13.892232459056668, - "meta-llama/Meta-Llama-3-8B":3.718509102406137, - "facebook/opt-6.7b":4.137426439363523, - "Qwen/Qwen2-7B":4.019159671931102, - "tiiuae/falcon-7b":3.534306161370759, - "bigscience/bloom-7b1":5.208961745879341, - "EleutherAI/gpt-j-6b":3.4668491651555446, - "huggyllama/llama-65b":2.6334970265470727, - "meta-llama/Llama-2-70b-hf":2.491589054514988, - "meta-llama/Meta-Llama-3-70B":2.9904107267016, - "google/gemma-7b":4.149094819615527, - "BAAI/Aquila-7B":4.671358785970369, - "Deci/DeciLM-7B":3.4662699434809126 - } - -def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - 
return model_name - -def dump_results(list_1, list_2, model_name, csv_file_name): - - assert len(list_1) == len(list_2) - - csv_file = csv_file_name + str(split_string(model_name)) + ".csv" - file_exists = os.path.exists(csv_file) - - with open(csv_file, 'a', newline = '') as csvfile: - writer = csv.writer(csvfile) - - if not file_exists: - writer.writerow(list_1) - - writer.writerow(list_2) - - csvfile.close() - - - -def main(args: argparse.Namespace): - print(args) - - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) - - #warm up - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=256, - ) - dummy_prompt_token_ids = np.random.randint(10000, size=(16, 256)) - dummy_inputs: List[PromptInputs] = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()] - - print("Warming up...") - for _ in tqdm(range(3), desc="Warmup iterations"): - llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) - - batch_size_list = [1,16,32,64] - input_output_list = [128,256,512,1024,2048] - - for bs in batch_size_list: - for input_output in input_output_list: - - args.batch_size = bs - args.output_len = input_output - args.input_len = input_output - - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_inputs: List[PromptInputs] = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()] - - start_time = time.perf_counter() - llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - total_tokens = args.batch_size*(args.input_len + args.input_len) - throughput = total_tokens/latency - - from power_utils import gpuPowerProbe - power_profile = gpuPowerProbe(interval=0.10) - - power_profile.start() - llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) - training_powers, training_powers_time = power_profile.stop() - power_profile.destroy() - - avg_power = np.mean(training_powers) - throughput_per_watt_avg_power = float(throughput/avg_power) - - perplexity = perplexity_dict[args.model] - - list_1 = ["Hardware","Num of Hardware","Framework","Model","Input Output Length","Batch 
Size","Latency","Throughput","area","avg_power","sum_power","Throughput_per_watt_area","Throughput_per_watt_avg","Throughput_per_watt_sum"] - list_2 = ["Nvidia GH200 GPU", args.tensor_parallel_size, "vLLM", args.model, args.input_len, args.batch_size, latency, throughput, None, avg_power, None, None, throughput_per_watt_avg_power,None] - dump_results(list_1, list_2, args.model, csv_file_name = "7b-power_results_") - - list_1 = ["Hardware","Num of Hardware","Framework","Model","Input Output Length","Batch Size","Latency","Throughput"] - list_2 = ["Nvidia GH200 GPU", args.tensor_parallel_size, "vLLM", args.model, args.input_len, args.batch_size, latency, throughput] - dump_results(list_1, list_2, args.model, csv_file_name = "7b-throughput_results_") - - list_1 = ["Hardware","Num of Hardware","Framework","Model","Input Output Length","Batch Size","Latency","Throughput","avg_power","Throughput_per_watt_avg", "Perplexity"] - list_2 = ["Nvidia GH200 GPU", args.tensor_parallel_size, "vLLM", args.model, args.input_len, args.batch_size, latency, throughput, avg_power, throughput_per_watt_avg_power,perplexity] - dump_results(list_1, list_2, args.model, csv_file_name = "7b-perplexity_results_") - - -if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=30, - help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') - parser.add_argument( - '--profile-result-dir', - type=str, - default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', action='store_true') - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. 
When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') - args = parser.parse_args() - main(args) \ No newline at end of file diff --git a/vLLM/GH200/build-container.sh b/vLLM/GH200/build-container.sh deleted file mode 100644 index 7deaf3c..0000000 --- a/vLLM/GH200/build-container.sh +++ /dev/null @@ -1,9 +0,0 @@ - -# Obtain your credentials by following instructions at -# https://docs.nvidia.com/ngc/gpu-cloud/ngc-private-registry-user-guide/index.html -export SINGULARITY_DOCKER_USERNAME="$oauthtoken" -export SINGULARITY_DOCKER_PASSWORD=YOUR_PASSWORD -export APPTAINER_DOCKER_USERNAME="$oauthtoken" -export APPTAINER_DOCKER_PASSWORD=YOUR_PASSWORD - -apptainer build vllm-gh200.sif vllm-gh200.def diff --git a/vLLM/GH200/power_utils.py b/vLLM/GH200/power_utils.py deleted file mode 100644 index 5d7906c..0000000 --- a/vLLM/GH200/power_utils.py +++ /dev/null @@ -1,68 +0,0 @@ -import multiprocessing -import os -import time -from py3nvml.py3nvml import nvmlDeviceGetPowerUsage, \ - nvmlDeviceGetCount, \ - nvmlDeviceGetHandleByIndex, \ - nvmlInit, \ - nvmlShutdown - -class gpuPowerProbe(object): - def __init__(self, interval, gpu_id=-1): - self.interval = multiprocessing.Value('d', interval) - self.len = int(7200/interval) - self.powers = multiprocessing.Array('d', self.len) - self.times = multiprocessing.Array('d', self.len) - self.gpu_id = multiprocessing.Value('i', gpu_id) - self.process = None - self.prevTime = multiprocessing.Value('d',time.time()) - self.halt = multiprocessing.Value('i',1) - self.count = multiprocessing.Value('i',0) - self.isrunning = multiprocessing.Value('i',0) - self.alive = multiprocessing.Value('i',0) - self.init() - - def _getGpuPower(self, powers, times, gpu_id, count, halt, alive, isrunning, prevTime, interval): - nvmlInit() - while (alive.value): - while (not halt.value): - isrunning.value = 1 - if gpu_id.value > -1: - power = nvmlDeviceGetPowerUsage(nvmlDeviceGetHandleByIndex(gpu_id.value)) - else: - power = 0 - num_gpus = nvmlDeviceGetCount() - for i in range(num_gpus): - power += nvmlDeviceGetPowerUsage(nvmlDeviceGetHandleByIndex(i)) - - new_time = time.time() - while (new_time-prevTime.value < interval.value): - new_time = time.time() - powers[count.value] = power - times[count.value] = new_time-prevTime.value - count.value += 1 - prevTime.value = new_time - isrunning.value = 0 - nvmlShutdown() - - def init(self): - self.halt.value = 1 - self.alive.value = 1 - self.process = multiprocessing.Process(target = self._getGpuPower, args = (self.powers, self.times, self.gpu_id, - self.count, self.halt, self.alive, self.isrunning, self.prevTime, self.interval)) - self.process.start() - - def start(self): - self.count.value = 0 - self.prevTime.value = time.time() - self.halt.value = 0 - - def stop(self): - self.halt.value = 1 - while (self.isrunning.value): - pass - return self.powers[:self.count.value], self.times[:self.count.value] - - def destroy(self): - self.alive.value = 0 - self.process.join() \ No newline at end of file diff --git a/vLLM/GH200/run-container.sh b/vLLM/GH200/run-container.sh deleted file mode 100644 index 8d79095..0000000 --- a/vLLM/GH200/run-container.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -l -#COBALT -t 6:00:00 -n 1 -q gpu_gh200 --jobname v_7b-models - -module use /soft/modulefiles/ -module load conda/2024.03.04 -source 
/soft/datascience/miniconda3/bin/activate - -apptainer exec --nv --no-mount /gpfs/jlse-fs0 \ - --bind /vast/users/sraskar/gh200/llm_research/:/vast/users/sraskar/gh200/llm_research/ \ - --bind /vast/users/sraskar/model_weights/GGUF_weights/:/vast/users/sraskar/model_weights/GGUF_weights \ - --bind /vast/users/sraskar/h100/tensorRT/trt_weights:/vast/users/sraskar/h100/tensorRT/trt_weights \ - --bind /vast/users/sraskar/mi250/hf:/vast/users/sraskar/mi250/hf vllm-gh200.sif \ - /vast/users/sraskar/gh200/llm_research/vllm/benchmarks/run-models.sh - - diff --git a/vLLM/GH200/run-models.sh b/vLLM/GH200/run-models.sh deleted file mode 100644 index c0df6be..0000000 --- a/vLLM/GH200/run-models.sh +++ /dev/null @@ -1,18 +0,0 @@ -export HF_TOKEN="your_hugging_face_token" -export HF_HOME="/hf" -export HF_DATASETS_CACHE="/hf" - - -source /vast/users/sraskar/gh200/llm_research/container-build/vllm/wheels/setup_wheels.sh -cd /vast/users/sraskar/gh200/llm_research/vllm/benchmarks/ - -for model_name in "meta-llama/Meta-Llama-3-8B";do - for tensor_parallel in 1; do - for batch_size in 1 16 32 64; do - for input_output_length in 128 256 512 1024 2048; do - python3 benchmark_throughput_power_gh200.py --device cuda --model=$model_name --tensor-parallel-size=$tensor_parallel --input-len=$input_output_length --output-len=$input_output_length --batch-size=$batch_size --dtype="float16" --trust-remote-code - done - done - done -done - diff --git a/vLLM/GH200/vllm-gh200.def b/vLLM/GH200/vllm-gh200.def deleted file mode 100644 index 89f79c4..0000000 --- a/vLLM/GH200/vllm-gh200.def +++ /dev/null @@ -1,49 +0,0 @@ -Bootstrap: docker -From: nvcr.io/nvidia/pytorch:24.06-py3 - - -%post - -NOW=`date` -echo "export NOW=\"${NOW}\"" >> $SINGULARITY_ENVIRONMENT - -mkdir /extra -mkdir /xdisk - -chown root:root /usr/lib -apt update -y && apt install -y build-essential curl openssh-server openssh-client pdsh - -pip install --upgrade pip wheel - -pip install \ - accelerate \ - deepspeed \ - openai \ - peft \ - pyarrow==14.0.2 \ - sentencepiece \ - tiktoken \ - transformers \ - trl - -pip install stanford-stk --no-deps - -pip install \ - aioprometheus \ - fastapi \ - fschat[model_worker,webui] \ - lm-format-enforcer==0.10.3 \ - outlines \ - prometheus-fastapi-instrumentator \ - protobuf==3.20.3 \ - ray==2.9.2 \ - typer==0.9.4 \ - uvicorn - - - - - -%label -This container uses nvidia ubuntu 22.04 as base and installs requirements to run vLLM on GH200. -Maintaier: Sid Raskar(sraskar@anl.gov) \ No newline at end of file diff --git a/vLLM/Gaudi2/README.MD b/vLLM/Gaudi2/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/H100/README.MD b/vLLM/H100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/MI250/README.MD b/vLLM/MI250/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/README.md b/vLLM/README.md deleted file mode 100644 index 4cf1b20..0000000 --- a/vLLM/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# vLLM - -vLLM
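
The deleted `benchmark_latency.py` and `benchmark_throughput_power_gh200.py` share one measurement pattern: build a batch of random dummy token ids, generate with `ignore_eos=True` so every request emits exactly `max_tokens`, time the call with `time.perf_counter()`, and report token throughput as total prompt-plus-generated tokens divided by latency. A condensed sketch of that loop, assuming a working vLLM install; the model id, batch size, and sequence lengths are illustrative defaults, not the benchmarked configurations:

```python
# Condensed sketch of the timing loop used by the deleted vLLM benchmark scripts.
# Assumes vLLM is installed; model id, batch size, and lengths are illustrative only.
import time

import numpy as np
from vllm import LLM, SamplingParams

batch_size, input_len, output_len = 8, 32, 128

llm = LLM(model="facebook/opt-125m", dtype="float16")
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    ignore_eos=True,        # force exactly output_len generated tokens per request
    max_tokens=output_len,
)

# Random token ids stand in for real prompts, as in the original scripts.
dummy_prompt_token_ids = np.random.randint(10000, size=(batch_size, input_len))
dummy_inputs = [{"prompt_token_ids": ids} for ids in dummy_prompt_token_ids.tolist()]

start = time.perf_counter()
llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False)
latency = time.perf_counter() - start

# Throughput as reported by the scripts: prompt + generated tokens per second.
throughput = batch_size * (input_len + output_len) / latency
print(f"latency: {latency:.3f} s, throughput: {throughput:.1f} tokens/s")
```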
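
The deleted `vLLM/GH200/power_utils.py` defines the `gpuPowerProbe` helper that the GH200 throughput script wraps around the timed `llm.generate(...)` call to sample GPU power via NVML in a background process. A minimal usage sketch of that pattern, assuming `py3nvml` is installed, an NVIDIA GPU is visible, and `power_utils.py` is on the import path; the `time.sleep` call is only a stand-in for the real workload:

```python
# Minimal sketch of the gpuPowerProbe sampling pattern from the deleted scripts.
# Assumes py3nvml, a visible NVIDIA GPU, and power_utils.py on the import path.
import time

import numpy as np
from power_utils import gpuPowerProbe

power_profile = gpuPowerProbe(interval=0.10)  # sample total GPU power every 100 ms

power_profile.start()
time.sleep(2.0)                               # stand-in for llm.generate(...) in the real scripts
powers, power_times = power_profile.stop()    # per-sample power (milliwatts via NVML) and sample intervals
power_profile.destroy()                       # terminate the background sampling process

avg_power = np.mean(powers)                   # the scripts divide throughput by this average
print(f"{len(powers)} samples, average power: {avg_power:.1f} mW")
```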