From 47d0f04f7699664aea9590318f10190a13bbc0e4 Mon Sep 17 00:00:00 2001 From: Sid Raskar Date: Sun, 8 Sep 2024 22:31:21 +0000 Subject: [PATCH] trtllm h100 content --- TensorRT-LLM/H100/README.MD | 56 ++ TensorRT-LLM/H100/p-llama2-7b.sh | 32 ++ TensorRT-LLM/H100/q-llama2-7b.sh | 37 ++ TensorRT-LLM/H100/run.py | 550 ++++++++++++++++++++ TensorRT-LLM/H100/run_power.py | 619 +++++++++++++++++++++++ TensorRT-LLM/H100/run_precision_bench.py | 563 +++++++++++++++++++++ TensorRT-LLM/H100/utils.py | 373 ++++++++++++++ 7 files changed, 2230 insertions(+) create mode 100755 TensorRT-LLM/H100/p-llama2-7b.sh create mode 100755 TensorRT-LLM/H100/q-llama2-7b.sh create mode 100644 TensorRT-LLM/H100/run.py create mode 100644 TensorRT-LLM/H100/run_power.py create mode 100644 TensorRT-LLM/H100/run_precision_bench.py create mode 100644 TensorRT-LLM/H100/utils.py diff --git a/TensorRT-LLM/H100/README.MD b/TensorRT-LLM/H100/README.MD index e69de29..1c2d0c6 100644 --- a/TensorRT-LLM/H100/README.MD +++ b/TensorRT-LLM/H100/README.MD @@ -0,0 +1,56 @@ +# TRT-LLM on H100 + +1. Set Up Virtual Environment + + ```bash + module use /soft/modulefiles/ + module load conda + module load openmpi/4.1.1-nvhpc + + conda create -n TensorRT_LLM python=3.10 + conda activate TensorRT_LLM + conda install -c conda-forge mpi4py openmpi + + ``` + +2. Install Dependencies + ```bash + git clone https://github.com/NVIDIA/TensorRT-LLM.git + + cd TensorRT-LLM + cd examples/llama/ + + MPICC=$(which mpicc) MPICXX=$(which mpicxx) pip install -r requirements.txt + ``` + +3. 
Run a Single Benchmark + ```bash + + export dir_1= + export dir_2= + export dir_3= + + python convert_checkpoint.py --tp_size=1 --model_dir=$dir_1 --output_dir=$dir_2 --dtype=float16 + + trtllm-build --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=float16 --max_batch_size=1 --max_input_len=128 --max_output_len=128 + + python3 ../run.py --model_name="mistral_7b" --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=128 --max_input_length=128 --run_profiling --batch_size=1 + ``` + +4. Replace or copy the files `run_power.py`, `run_precision_bench.py`, `utils.py` and `run.py` from this directory into the cloned TensorRT-LLM directory. + +5. Run benchmarks. +Use `p-llama2-7b.sh` to run power benchmarks. +Use `q-llama2-7b.sh` to run precision benchmarks. + + + + + + + + + + + + diff --git a/TensorRT-LLM/H100/p-llama2-7b.sh b/TensorRT-LLM/H100/p-llama2-7b.sh new file mode 100755 index 0000000..2b846ca --- /dev/null +++ b/TensorRT-LLM/H100/p-llama2-7b.sh @@ -0,0 +1,32 @@ +export HF_TOKEN="${HF_TOKEN:?export HF_TOKEN with your Hugging Face access token before running this script}" +export HF_HOME="/vast/users/sraskar/mi250/hf/hub" +export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub" + +pip install pynvml==11.5.0 +pip install pydantic-core==2.18.1 +pip install psutil +pip install py3nvml + +cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/ + +model_name="meta-llama/Llama-2-7b-hf" +dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9" +dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b" +dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b" + + + + +for tensor_parallel in 1; do + for precision in "float16"; do + rm -rf "${dir_2:?}"/* + rm -rf "${dir_3:?}"/* + python convert_checkpoint.py --workers=64 --tp_size=$tensor_parallel --model_dir=$dir_1 --output_dir=$dir_2 --dtype=$precision + for 
batch_size in 1 16 32 64; do + for input_output_length in 1024; do + trtllm-build --workers=64 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=$precision --gpt_attention_plugin=$precision --max_batch_size=$batch_size --max_input_len=$input_output_length + mpirun -np $tensor_parallel python3 ../run_power.py --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size + done + done + done +done \ No newline at end of file diff --git a/TensorRT-LLM/H100/q-llama2-7b.sh b/TensorRT-LLM/H100/q-llama2-7b.sh new file mode 100755 index 0000000..60249ce --- /dev/null +++ b/TensorRT-LLM/H100/q-llama2-7b.sh @@ -0,0 +1,37 @@ +export HF_TOKEN="${HF_TOKEN:?export HF_TOKEN with your Hugging Face access token before running this script}" +export HF_HOME="/vast/users/sraskar/mi250/hf/hub" +export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub" + +pip install pynvml==11.5.0 +pip install pydantic-core==2.18.1 +# pip install psutil +pip install psutil==5.9.8 + +pip install pydantic==2.7.0 +pip install regex==2024.5.15 + +cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/ + +model_name="meta-llama/Llama-2-7b-hf" +dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9" +# dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b" +# dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b" +dir_2="." +dir_3="." 
+ + +for tensor_parallel in 1; do + for precision in "full_prec" "int8_sq" "int4_awq"; do + for kv_cache_precision in "int8" "fp8"; do + # rm -rf $dir_2/* + # rm -rf $dir_3/* + python ../quantization/quantize.py --model_dir $dir_1 --dtype float16 --qformat $precision --kv_cache_dtype $kv_cache_precision --output_dir $dir_2 --calib_size 10 --tp_size $tensor_parallel --batch_size=1 + for batch_size in 1 16 32 64; do + for input_output_length in 1024; do + trtllm-build --workers=48 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --max_batch_size=$batch_size --max_input_len=$input_output_length + mpirun -np $tensor_parallel python3 ../run_precision_bench.py --qformat $precision --kv_cache_dtype $kv_cache_precision --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size + done + done + done + done +done \ No newline at end of file diff --git a/TensorRT-LLM/H100/run.py b/TensorRT-LLM/H100/run.py new file mode 100644 index 0000000..f1994d3 --- /dev/null +++ b/TensorRT-LLM/H100/run.py @@ -0,0 +1,550 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import ast +import csv +import os +from pathlib import Path + +import numpy as np +import torch +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + add_common_args, load_tokenizer, read_decoder_start_token_id, + read_model_name, supports_inflight_batching, + throttle_generator) + +import tensorrt_llm +import tensorrt_llm.profiler +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner + +if PYTHON_BINDINGS: + from tensorrt_llm.runtime import ModelRunnerCpp + + +def parse_arguments(args=None): + # see `add_common_args` for extended list of arguments + parser = argparse.ArgumentParser() + + parser.add_argument('--pp_size', type=int, default = 1) + parser.add_argument('--tp_size', type=int, default = 1) + parser.add_argument('--moe_ep_size', type=int, default = 1) + parser.add_argument('--moe_tp_size', type=int, default = 1) + parser.add_argument('--model_name', type=str, required=True) + parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') + parser.add_argument('--precision', type=str, default="float16", help="precision") + parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") + + parser.add_argument('--qformat', type=str, default="float16", help="precision") + parser.add_argument('--kv_cache_dtype', type=str, default="float16", help="precision") + + parser.add_argument('--max_input_length', type=int, default=923) + parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument( + '--input_text', + type=str, + nargs='+', + default=["Born in north-east France, Soyer trained as a"]) + parser.add_argument( + '--input_file', + type=str, + help= + 'CSV or Numpy file containing tokenized input. 
Alternative to text input.', + default=None) + parser.add_argument('--output_csv', + type=str, + help='CSV file where the tokenized output is stored.', + default=None) + parser.add_argument('--output_npy', + type=str, + help='Numpy file where the tokenized output is stored.', + default=None) + parser.add_argument( + '--output_logits_npy', + type=str, + help= + 'Numpy file where the generation logits are stored. Use only when num_beams==1', + default=None) + parser.add_argument('--output_log_probs_npy', + type=str, + help='Numpy file where the log_probs are stored', + default=None) + parser.add_argument('--output_cum_log_probs_npy', + type=str, + help='Numpy file where the cum_log_probs are stored', + default=None) + parser.add_argument( + '--run_profiling', + default=False, + action='store_true', + help="Run several 10 iterations to profile the inference latencies.") + parser = add_common_args(parser) + + return parser.parse_args(args=args) + + +def parse_input(tokenizer, + input_text=None, + prompt_template=None, + input_file=None, + add_special_tokens=True, + max_input_length=923, + pad_id=None, + num_prepend_vtokens=[], + model_name=None, + model_version=None): + if pad_id is None: + pad_id = tokenizer.pad_token_id + + batch_input_ids = [] + if input_file is None: + for curr_text in input_text: + if prompt_template is not None: + curr_text = prompt_template.format(input_text=curr_text) + input_ids = tokenizer.encode(curr_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_ids = np.array(line, dtype='int32') + batch_input_ids.append(input_ids[-max_input_length:]) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + input_ids = row[row != pad_id] + 
batch_input_ids.append(input_ids[-max_input_length:]) + elif input_file.endswith('.txt'): + with open(input_file, 'r', encoding='utf-8', + errors='replace') as txt_file: + input_text = txt_file.readlines() + batch_input_ids = tokenizer( + input_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length)["input_ids"] + else: + print('Input file format not supported.') + raise SystemExit + + if num_prepend_vtokens: + assert len(num_prepend_vtokens) == len(batch_input_ids) + base_vocab_size = tokenizer.vocab_size - len( + tokenizer.special_tokens_map.get('additional_special_tokens', [])) + for i, length in enumerate(num_prepend_vtokens): + batch_input_ids[i] = list( + range(base_vocab_size, + base_vocab_size + length)) + batch_input_ids[i] + + if input_file is None and 'GLM' in model_name and model_version == 'glm': + for ids in batch_input_ids: + ids.append(tokenizer.sop_token_id) + + batch_input_ids = [ + torch.tensor(x, dtype=torch.int32) for x in batch_input_ids + ] + return batch_input_ids + + +def print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=None, + output_npy=None, + context_logits=None, + generation_logits=None, + cum_log_probs=None, + log_probs=None, + output_logits_npy=None, + output_cum_log_probs_npy=None, + output_log_probs_npy=None): + batch_size, num_beams, _ = output_ids.size() + if output_csv is None and output_npy is None: + for batch_idx in range(batch_size): + inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( + ) + input_text = tokenizer.decode(inputs) + print(f'Input [Text {batch_idx}]: \"{input_text}\"') + for beam in range(num_beams): + output_begin = input_lengths[batch_idx] + output_end = sequence_lengths[batch_idx][beam] + outputs = output_ids[batch_idx][beam][ + output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print( + f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') + + output_ids = 
output_ids.reshape((-1, output_ids.size(2))) + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) + + # Save context logits + if context_logits is not None and output_logits_npy is not None: + context_logits = torch.cat(context_logits, axis=0) + vocab_size_padded = context_logits.shape[-1] + context_logits = context_logits.reshape([1, -1, vocab_size_padded]) + + output_context_logits_npy = output_logits_npy.split( + '.npy')[0] + "_context" + output_context_logits_file = Path(output_context_logits_npy) + context_outputs = np.array( + context_logits.squeeze(0).cpu().contiguous(), + dtype='float32') # [promptLengthSum, vocabSize] + np.save(output_context_logits_file, context_outputs) + + # Save generation logits + if generation_logits is not None and output_logits_npy is not None and num_beams == 1: + output_generation_logits_npy = output_logits_npy.split( + '.npy')[0] + "_generation" + output_generation_logits_file = Path(output_generation_logits_npy) + generation_outputs = np.array(generation_logits.cpu().contiguous(), + dtype='float32') + np.save(output_generation_logits_file, generation_outputs) + + # Save cum log probs + if cum_log_probs is not None and output_cum_log_probs_npy is not None: + cum_log_probs_file = Path(output_cum_log_probs_npy) + cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), + dtype='float32') + np.save(cum_log_probs_file, cum_log_probs_outputs) + + # Save cum log probs + if log_probs is not None and output_log_probs_npy is not None: + log_probs_file = Path(output_log_probs_npy) + log_probs_outputs = 
np.array(log_probs.cpu().contiguous(), + dtype='float32') + np.save(log_probs_file, log_probs_outputs) + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + # different handling if encoder-decoder models + is_enc_dec = { + name + for name in os.listdir(args.engine_dir) + if os.path.isdir(os.path.join(args.engine_dir, name)) + } == {'encoder', 'decoder'} + if is_enc_dec: + logger.warning( + "This path is an encoder-decoder model. Using different handling.") + assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." + + model_name, model_version = read_model_name( + args.engine_dir) if not is_enc_dec else ("", "") + if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: + logger.warning( + "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." + ) + args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] + + tokenizer, pad_id, end_id = load_tokenizer( + tokenizer_dir=args.tokenizer_dir, + vocab_file=args.vocab_file, + model_name=model_name, + model_version=model_version, + tokenizer_type=args.tokenizer_type, + ) + + if args.end_id: + end_id = args.end_id + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + + prompt_template = None + if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: + prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] + batch_input_ids = parse_input(tokenizer=tokenizer, + input_text=args.input_text, + prompt_template=prompt_template, + input_file=args.input_file, + add_special_tokens=args.add_special_tokens, + max_input_length=args.max_input_length, + pad_id=pad_id, + num_prepend_vtokens=args.num_prepend_vtokens, + 
model_name=model_name, + model_version=model_version) + + if is_enc_dec: + encoder_input_ids = batch_input_ids + decoder_start_token_id = read_decoder_start_token_id( + os.path.join(args.engine_dir, "decoder")) + decoder_input_ids = [ + torch.tensor([decoder_start_token_id], dtype=torch.int32) + for _ in batch_input_ids + ] + + input_lengths = [x.size(0) for x in decoder_input_ids + ] if is_enc_dec else [x.size(0) for x in batch_input_ids] + encoder_input_lengths = [x.size(0) + for x in encoder_input_ids] if is_enc_dec else None + + if not args.use_py_session and not supports_inflight_batching( + os.path.join(args.engine_dir, "decoder") if is_enc_dec else args. + engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True + + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." + ) + args.use_py_session = True + if args.debug_mode and not args.use_py_session: + logger.warning( + "Debug mode is not supported in C++ session for now, fallback to Python session." + ) + args.use_py_session = True + if args.return_all_generated_tokens and args.use_py_session: + raise ValueError( + "Returning all the generated tokens at each step is not supported in the Python session, use C++ session instead." + ) + if (not args.return_all_generated_tokens) and args.streaming and ( + args.num_beams > 1): + logger.warning( + "Setting return_all_generated_tokens to True since streaming AND beam search are done simultaneously. " + "Returning the full beams at each streaming step is needed because beam search + streaming can change previous outputs. " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." 
+ ) + args.return_all_generated_tokens = True + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict( + engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + ) + if not args.use_py_session: + runner_kwargs.update(is_enc_dec=is_enc_dec) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=len(batch_input_ids), + max_input_len=max( + encoder_input_lengths if is_enc_dec else input_lengths), + max_output_len=args.max_output_len, + max_beam_width=args.num_beams, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner = runner_cls.from_dir(**runner_kwargs) + + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids=decoder_input_ids + if is_enc_dec else batch_input_ids, + encoder_input_ids=encoder_input_ids if is_enc_dec else None, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy is not None), + output_log_probs=(args.output_log_probs_npy is not None), + random_seed=args.random_seed, + lora_uids=args.lora_task_uids, + prompt_table=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + no_repeat_ngram_size=args.no_repeat_ngram_size, + return_dict=True, + medusa_choices=args.medusa_choices, + return_all_generated_tokens=args.return_all_generated_tokens) + torch.cuda.synchronize() + + if args.streaming: + for curr_outputs in throttle_generator(outputs, + args.streaming_interval): + if runtime_rank == 0: + output_ids = curr_outputs['output_ids'] + sequence_lengths = curr_outputs['sequence_lengths'] + cum_log_probs = None + log_probs = None + if args.output_cum_log_probs_npy is not None: + cum_log_probs = curr_outputs['cum_log_probs']  # read the current streamed step; `outputs` is the iterable being consumed here + if args.output_log_probs_npy is not None: + log_probs = curr_outputs['log_probs'] + print_output( + tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + 
output_npy=args.output_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) + else: + if runtime_rank == 0: + output_ids = outputs['output_ids'] + sequence_lengths = outputs['sequence_lengths'] + context_logits = None + generation_logits = None + cum_log_probs = None + log_probs = None + if runner.gather_context_logits: + context_logits = outputs['context_logits'] + if runner.gather_generation_logits: + generation_logits = outputs['generation_logits'] + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] + print_output(tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy, + context_logits=context_logits, + generation_logits=generation_logits, + output_logits_npy=args.output_logits_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) + + if args.run_profiling: + ite = 1 + # warmup + for _ in range(ite): + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, + lora_uids=args.lora_task_uids, + 
prompt_table=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) + torch.cuda.synchronize() + + tensorrt_llm.profiler.start("tmp") + ite=1 + for _ in range(ite): + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, + lora_uids=args.lora_task_uids, + prompt_table=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) + torch.cuda.synchronize() + tensorrt_llm.profiler.stop("tmp") + + print( + f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" + ) + + +if __name__ == '__main__': + args = parse_arguments() + main(args) diff --git a/TensorRT-LLM/H100/run_power.py b/TensorRT-LLM/H100/run_power.py new file mode 100644 index 0000000..9b3de62 --- /dev/null +++ b/TensorRT-LLM/H100/run_power.py @@ -0,0 +1,619 @@ + +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import csv +import os +from pathlib import Path + +# from huggingface_hub import login +# login(token=os.environ["HF_TOKEN"])  # never commit a literal access token + +from power_utils import gpuPowerProbe  # NOTE(review): power_utils.py is not one of the files added by this patch - confirm it is importable +power_profile = gpuPowerProbe(interval=0.10) + + +import numpy as np +import torch +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + add_common_args, load_tokenizer, read_model_name, + throttle_generator) + +import tensorrt_llm +import tensorrt_llm.profiler +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner + +if PYTHON_BINDINGS: + from tensorrt_llm.runtime import ModelRunnerCpp + + +def parse_arguments(args=None): + parser = argparse.ArgumentParser() + + parser.add_argument('--pp_size', type=int, default = 1) + parser.add_argument('--tp_size', type=int, default = 1) + parser.add_argument('--moe_ep_size', type=int, default = 1) + parser.add_argument('--moe_tp_size', type=int, default = 1) + parser.add_argument('--model_name', type=str, required=True) + parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') + parser.add_argument('--precision', type=str, default="float16", help="precision") + parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") + + parser.add_argument('--max_input_length', type=int, default=923) + 
parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument( + '--input_text', + type=str, + nargs='+', + default=["Born in north-east France, Soyer trained as a"]) + parser.add_argument( + '--input_file', + type=str, + help= + 'CSV or Numpy file containing tokenized input. Alternative to text input.', + default=None) + parser.add_argument('--output_csv', + type=str, + help='CSV file where the tokenized output is stored.', + default=None) + parser.add_argument('--output_npy', + type=str, + help='Numpy file where the tokenized output is stored.', + default=None) + parser.add_argument( + '--output_logits_npy', + type=str, + help= + 'Numpy file where the generation logits are stored. Use only when num_beams==1', + default=None) + parser.add_argument('--output_log_probs_npy', + type=str, + help='Numpy file where the log_probs are stored', + default=None) + parser.add_argument('--output_cum_log_probs_npy', + type=str, + help='Numpy file where the cum_log_probs are stored', + default=None) + parser.add_argument( + '--run_profiling', + default=False, + action='store_true', + help="Run several 10 iterations to profile the inference latencies.") + parser = add_common_args(parser) + + return parser.parse_args(args=args) + + +def parse_input(tokenizer, + input_text=None, + prompt_template=None, + input_file=None, + add_special_tokens=True, + max_input_length=923, + pad_id=None, + num_prepend_vtokens=[], + model_name=None, + model_version=None): + if pad_id is None: + pad_id = tokenizer.pad_token_id + + batch_input_ids = [] + if input_file is None: + for curr_text in input_text: + if prompt_template is not None: + curr_text = prompt_template.format(input_text=curr_text) + input_ids = tokenizer.encode(curr_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = 
def print_output(tokenizer,
                 output_ids,
                 input_lengths,
                 sequence_lengths,
                 output_csv=None,
                 output_npy=None,
                 context_logits=None,
                 generation_logits=None,
                 cum_log_probs=None,
                 log_probs=None,
                 output_logits_npy=None,
                 output_cum_log_probs_npy=None,
                 output_log_probs_npy=None):
    """Print decoded generations and/or dump raw results to disk.

    Args:
        tokenizer: Object exposing ``decode(list_of_ids)``.
        output_ids: Tensor indexed as ``[batch, beam, token]``.
        input_lengths: Per-batch prompt length; tokens before it are the
            prompt, tokens from it up to ``sequence_lengths`` are generated.
        sequence_lengths: Indexed as ``[batch][beam]``; end of each sequence.
        output_csv: Optional CSV path for the flattened token ids.
        output_npy: Optional ``.npy`` path for the flattened token ids.
        context_logits: Optional sequence of tensors, concatenated on axis 0.
        generation_logits: Optional tensor; saved only when there is one beam.
        cum_log_probs: Optional tensor of cumulative log-probs.
        log_probs: Optional tensor of log-probs.
        output_logits_npy: Base path; ``_context`` / ``_generation`` is
            appended to the part before ``.npy``.
        output_cum_log_probs_npy: Optional path for ``cum_log_probs``.
        output_log_probs_npy: Optional path for ``log_probs``.

    Text is printed only when neither ``output_csv`` nor ``output_npy`` is
    given. Parent directories are created for every file that is written.
    """

    def _save_npy(target, tensor, dtype):
        # One place for "make the directory, move to host, save", so every
        # output behaves the same way when its directory does not exist yet.
        target = Path(target)
        target.parent.mkdir(exist_ok=True, parents=True)
        np.save(target, np.array(tensor.cpu().contiguous(), dtype=dtype))

    batch_size, num_beams, _ = output_ids.size()
    if output_csv is None and output_npy is None:
        for batch_idx in range(batch_size):
            prompt_len = input_lengths[batch_idx]
            prompt_ids = output_ids[batch_idx][0][:prompt_len].tolist()
            input_text = tokenizer.decode(prompt_ids)
            print(f'Input [Text {batch_idx}]: "{input_text}"')
            for beam in range(num_beams):
                output_end = sequence_lengths[batch_idx][beam]
                generated = output_ids[batch_idx][beam][
                    prompt_len:output_end].tolist()
                output_text = tokenizer.decode(generated)
                print(
                    f'Output [Text {batch_idx} Beam {beam}]: "{output_text}"')

    # Collapse batch and beam into one row axis for the file dumps.
    output_ids = output_ids.reshape((-1, output_ids.size(2)))
    if output_csv is not None:
        output_file = Path(output_csv)
        output_file.parent.mkdir(exist_ok=True, parents=True)
        # newline='' is what the csv module expects; without it rows gain
        # blank lines on platforms that translate line endings.
        with open(output_file, 'w', newline='') as csv_file:
            csv.writer(csv_file, delimiter=',').writerows(output_ids.tolist())

    if output_npy is not None:
        _save_npy(output_npy, output_ids, 'int32')

    # Save context logits
    if context_logits is not None and output_logits_npy is not None:
        context_logits = torch.cat(context_logits, axis=0)
        vocab_size_padded = context_logits.shape[-1]
        context_logits = context_logits.reshape([1, -1, vocab_size_padded])
        # Saved array is [promptLengthSum, vocabSize].
        _save_npy(
            output_logits_npy.split('.npy')[0] + "_context",
            context_logits.squeeze(0), 'float32')

    # Save generation logits
    if (generation_logits is not None and output_logits_npy is not None
            and num_beams == 1):
        _save_npy(
            output_logits_npy.split('.npy')[0] + "_generation",
            generation_logits, 'float32')

    # Save cum log probs
    if cum_log_probs is not None and output_cum_log_probs_npy is not None:
        _save_npy(output_cum_log_probs_npy, cum_log_probs, 'float32')

    # Save log probs
    if log_probs is not None and output_log_probs_npy is not None:
        _save_npy(output_log_probs_npy, log_probs, 'float32')
def _sampling_kwargs(args, end_id, pad_id, stop_words_list, bad_words_list):
    """Collect the ``runner.generate`` arguments shared by every call in main."""
    return dict(
        max_new_tokens=args.max_output_len,
        max_attention_window_size=args.max_attention_window_size,
        end_id=end_id,
        pad_id=pad_id,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        num_beams=args.num_beams,
        length_penalty=args.length_penalty,
        early_stopping=args.early_stopping,
        repetition_penalty=args.repetition_penalty,
        presence_penalty=args.presence_penalty,
        frequency_penalty=args.frequency_penalty,
        stop_words_list=stop_words_list,
        bad_words_list=bad_words_list,
        output_cum_log_probs=args.output_cum_log_probs_npy is not None,
        output_log_probs=args.output_log_probs_npy is not None,
        random_seed=args.random_seed,
        lora_uids=args.lora_task_uids,
        prompt_table=args.prompt_table_path,
        prompt_tasks=args.prompt_tasks,
        streaming=args.streaming,
        output_sequence_lengths=True,
        return_dict=True,
    )


def main(args):
    """Run one generation, report it, then optionally record a power trace.

    Steps: detect encoder-decoder engines from the engine directory layout,
    load the tokenizer, tokenize the prompts, build the runner, generate once
    and print/save the result on rank 0. With ``args.run_profiling`` a
    warm-up pass is followed by a measured pass wrapped in the module-level
    ``power_profile`` probe, and rank 0 appends one row to
    ``power_<model>.csv`` (the header is written when the file is new).
    """
    import csv
    import os

    runtime_rank = tensorrt_llm.mpi_rank()
    logger.set_level(args.log_level)

    # different handling if encoder-decoder models
    engine_subdirs = {
        name
        for name in os.listdir(args.engine_dir)
        if os.path.isdir(os.path.join(args.engine_dir, name))
    }
    is_enc_dec = engine_subdirs == {'encoder', 'decoder'}
    if is_enc_dec:
        logger.warning(
            "This path is an encoder-decoder model. Using different handling.")
        assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead."

    model_name, model_version = read_model_name(
        args.engine_dir) if not is_enc_dec else ("", "")
    if args.tokenizer_dir is None:
        logger.warning(
            "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect."
        )
        args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name]

    tokenizer, pad_id, end_id = load_tokenizer(
        args,
        tokenizer_dir=args.tokenizer_dir,
        vocab_file=args.vocab_file,
        model_name=model_name,
        model_version=model_version,
        tokenizer_type=args.tokenizer_type,
    )

    stop_words_list = None
    if args.stop_words:
        stop_words_list = tensorrt_llm.runtime.decode_words_list(
            args.stop_words, tokenizer)

    bad_words_list = None
    if args.bad_words:
        bad_words_list = tensorrt_llm.runtime.decode_words_list(
            args.bad_words, tokenizer)

    prompt_template = None
    if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES:
        prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name]
    batch_input_ids = parse_input(tokenizer=tokenizer,
                                  input_text=args.input_text,
                                  prompt_template=prompt_template,
                                  input_file=args.input_file,
                                  add_special_tokens=args.add_special_tokens,
                                  max_input_length=args.max_input_length,
                                  pad_id=pad_id,
                                  num_prepend_vtokens=args.num_prepend_vtokens,
                                  model_name=model_name,
                                  model_version=model_version)

    if is_enc_dec:
        encoder_input_ids = batch_input_ids
        decoder_input_ids = [
            torch.tensor([pad_id], dtype=torch.int32) for _ in batch_input_ids
        ]  # by default decoder_start_token_id for T5

    input_lengths = [x.size(0) for x in decoder_input_ids
                     ] if is_enc_dec else [x.size(0) for x in batch_input_ids]
    encoder_input_lengths = [x.size(0)
                             for x in encoder_input_ids] if is_enc_dec else None

    if not PYTHON_BINDINGS and not args.use_py_session:
        logger.warning(
            "Python bindings of C++ session is unavailable, fallback to Python session."
        )
        args.use_py_session = True
    if args.debug_mode and not args.use_py_session:
        logger.warning(
            "Debug mode is not supported in C++ session for now, fallback to Python session."
        )
        args.use_py_session = True
    runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp
    runner_kwargs = dict(
        engine_dir=args.engine_dir,
        lora_dir=args.lora_dir,
        rank=runtime_rank,
        debug_mode=args.debug_mode,
        lora_ckpt_source=args.lora_ckpt_source,
        gpu_weights_percent=args.gpu_weights_percent,
    )
    if not args.use_py_session:
        runner_kwargs.update(is_enc_dec=is_enc_dec)
    if args.medusa_choices is not None:
        args.medusa_choices = ast.literal_eval(args.medusa_choices)
        assert args.temperature == 1.0, "Medusa should use temperature == 1.0"
        assert args.num_beams == 1, "Medusa should use num_beams == 1"
        runner_kwargs.update(medusa_choices=args.medusa_choices)
    if not args.use_py_session:
        runner_kwargs.update(
            max_batch_size=len(batch_input_ids),
            max_input_len=max(
                encoder_input_lengths if is_enc_dec else input_lengths),
            max_output_len=args.max_output_len,
            max_beam_width=args.num_beams,
            max_attention_window_size=args.max_attention_window_size,
            sink_token_length=args.sink_token_length,
            max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache,
            kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse,
            kv_cache_free_gpu_memory_fraction=args.
            kv_cache_free_gpu_memory_fraction,
            enable_chunked_context=args.enable_chunked_context,
        )
    runner = runner_cls.from_dir(**runner_kwargs)

    sampling = _sampling_kwargs(args, end_id, pad_id, stop_words_list,
                                bad_words_list)

    with torch.no_grad():
        outputs = runner.generate(
            batch_input_ids=decoder_input_ids
            if is_enc_dec else batch_input_ids,
            encoder_input_ids=encoder_input_ids if is_enc_dec else None,
            sink_token_length=args.sink_token_length,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            medusa_choices=args.medusa_choices,
            **sampling)
        torch.cuda.synchronize()

    want_cum_log_probs = args.output_cum_log_probs_npy is not None
    want_log_probs = args.output_log_probs_npy is not None

    if args.streaming:
        for curr_outputs in throttle_generator(outputs,
                                               args.streaming_interval):
            if runtime_rank == 0:
                # Read from the streamed chunk: `outputs` is the generator
                # here and cannot be indexed by key.
                print_output(
                    tokenizer,
                    curr_outputs['output_ids'],
                    input_lengths,
                    curr_outputs['sequence_lengths'],
                    output_csv=args.output_csv,
                    output_npy=args.output_npy,
                    cum_log_probs=(curr_outputs['cum_log_probs']
                                   if want_cum_log_probs else None),
                    log_probs=(curr_outputs['log_probs']
                               if want_log_probs else None),
                    output_cum_log_probs_npy=args.output_cum_log_probs_npy,
                    output_log_probs_npy=args.output_log_probs_npy)
    else:
        if runtime_rank == 0:
            context_logits = None
            generation_logits = None
            if runner.gather_context_logits:
                context_logits = outputs['context_logits']
            if runner.gather_generation_logits:
                generation_logits = outputs['generation_logits']
            print_output(
                tokenizer,
                outputs['output_ids'],
                input_lengths,
                outputs['sequence_lengths'],
                output_csv=args.output_csv,
                output_npy=args.output_npy,
                context_logits=context_logits,
                generation_logits=generation_logits,
                output_logits_npy=args.output_logits_npy,
                cum_log_probs=(outputs['cum_log_probs']
                               if want_cum_log_probs else None),
                log_probs=(outputs['log_probs'] if want_log_probs else None),
                output_cum_log_probs_npy=args.output_cum_log_probs_npy,
                output_log_probs_npy=args.output_log_probs_npy)

    if args.run_profiling:
        ite = 1
        # warmup
        for _ in range(ite):
            with torch.no_grad():
                outputs = runner.generate(batch_input_ids, **sampling)
                torch.cuda.synchronize()

        for _ in range(ite):
            with torch.no_grad():
                power_profile.start()
                outputs = runner.generate(batch_input_ids, **sampling)
                # Wait for the GPU before closing the sampling window, so
                # the trace covers the whole generation.
                torch.cuda.synchronize()
                training_powers, training_powers_time = power_profile.stop()
        # Tear the probe down once, after the last measured pass.
        power_profile.destroy()

        header = [
            "Hardware",
            "Num of Hardware",
            "Framework",
            "Model",
            "Input Output Length",
            "Batch Size",
            "training_powers",
            "training_powers_time",
        ]
        # NOTE(review): label changed from "Nvidia A100 GPU"; this script
        # lives in the H100 benchmark directory. Confirm with the result
        # consumers before merging old and new CSVs.
        row = [
            "Nvidia H100 GPU",
            args.tp_size,
            "TensorRT-LLM",
            args.model_name,
            args.max_input_length,
            args.batch_size,
            list(training_powers),
            list(training_powers_time),
        ]
        assert len(header) == len(row)

        # Hub-style names ("org/model") are reduced to their last component.
        csv_file = "power_" + str(args.model_name.split("/")[-1]) + ".csv"

        if runtime_rank == 0:
            file_exists = os.path.exists(csv_file)
            with open(csv_file, 'a', newline='') as csvfile:
                writer = csv.writer(csvfile)
                if not file_exists:
                    writer.writerow(header)
                writer.writerow(row)
import random
import string


def generate_random_word(length):
    """Return a string of ``length`` random ASCII letters."""
    letters = string.ascii_letters
    return ''.join(random.choice(letters) for _ in range(length))


def generate_input(args):
    """Build the synthetic benchmark prompts.

    Each prompt is the word "France" repeated ``args.max_input_length``
    times, separated by single spaces; the same prompt is returned
    ``args.batch_size`` times.
    """
    prompt = " ".join(["France"] * args.max_input_length)
    return [prompt] * args.batch_size


if __name__ == '__main__':
    args = parse_arguments()
    # The benchmark always runs on the synthetic prompt; any --input_text
    # given on the command line is replaced here.
    args.input_text = generate_input(args)
    main(args)
def parse_arguments(args=None):
    """Parse the command line of the precision benchmark.

    Args:
        args: Optional argument list; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The parsed namespace, including whatever ``add_common_args`` adds.
    """
    parser = argparse.ArgumentParser()

    # Values recorded in the results CSV.
    parser.add_argument('--pp_size', type=int, default=1)
    parser.add_argument('--tp_size', type=int, default=1)
    parser.add_argument('--moe_ep_size', type=int, default=1)
    parser.add_argument('--moe_tp_size', type=int, default=1)
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1, help='Batch Size')
    parser.add_argument('--precision',
                        type=str,
                        default="float16",
                        help="precision")
    parser.add_argument('--int8_kv_cache',
                        default=False,
                        action='store_true',
                        help="Int8 KV Cache.")

    parser.add_argument(
        '--qformat',
        type=str,
        default="float16",
        help="Weight quantization format recorded in the results CSV.")
    parser.add_argument(
        '--kv_cache_dtype',
        type=str,
        default="float16",
        help="KV cache data type recorded in the results CSV.")

    parser.add_argument('--max_input_length', type=int, default=923)
    parser.add_argument('--max_output_len', type=int, required=True)
    parser.add_argument(
        '--input_text',
        type=str,
        nargs='+',
        default=["Born in north-east France, Soyer trained as a"])
    parser.add_argument(
        '--input_file',
        type=str,
        help=
        'CSV or Numpy file containing tokenized input, or a .txt file with one prompt per line. Alternative to text input.',
        default=None)
    parser.add_argument('--output_csv',
                        type=str,
                        help='CSV file where the tokenized output is stored.',
                        default=None)
    parser.add_argument('--output_npy',
                        type=str,
                        help='Numpy file where the tokenized output is stored.',
                        default=None)
    parser.add_argument(
        '--output_logits_npy',
        type=str,
        help=
        'Numpy file where the generation logits are stored. Use only when num_beams==1',
        default=None)
    parser.add_argument('--output_log_probs_npy',
                        type=str,
                        help='Numpy file where the log_probs are stored',
                        default=None)
    parser.add_argument('--output_cum_log_probs_npy',
                        type=str,
                        help='Numpy file where the cum_log_probs are stored',
                        default=None)
    parser.add_argument(
        '--run_profiling',
        default=False,
        action='store_true',
        help=
        "Run 10 warm-up iterations, then time one iteration and append latency and throughput to the results CSV."
    )
    parser = add_common_args(parser)

    return parser.parse_args(args=args)
def parse_input(tokenizer,
                input_text=None,
                prompt_template=None,
                input_file=None,
                add_special_tokens=True,
                max_input_length=923,
                pad_id=None,
                num_prepend_vtokens=None,
                model_name=None,
                model_version=None):
    """Turn text prompts or a token file into a list of int32 id tensors.

    Args:
        tokenizer: Tokenizer used for text input; also supplies
            ``pad_token_id`` when ``pad_id`` is ``None``.
        input_text: Iterable of prompts, used when ``input_file`` is ``None``.
        prompt_template: Optional ``str.format`` template with an
            ``{input_text}`` field, applied to each text prompt.
        input_file: Optional ``.csv`` (one row of ids per prompt), ``.npy``
            (2-D id array, ``pad_id`` entries removed) or ``.txt`` (one
            prompt per line) file.
        add_special_tokens: Forwarded to the tokenizer.
        max_input_length: Text is truncated by the tokenizer; CSV/NPY rows
            keep their last ``max_input_length`` ids.
        pad_id: Padding id stripped from ``.npy`` rows.
        num_prepend_vtokens: Optional per-prompt count of virtual token ids
            to prepend; must have one entry per prompt.
        model_name: Architecture name; with ``model_version`` it selects the
            ChatGLM ``glm`` special case that appends ``sop_token_id``.
        model_version: See ``model_name``.

    Returns:
        One ``torch.int32`` tensor per prompt.

    Raises:
        SystemExit: When ``input_file`` has an unsupported extension
            (non-zero exit status, message on stderr).
    """
    if pad_id is None:
        pad_id = tokenizer.pad_token_id

    batch_input_ids = []
    if input_file is None:
        for curr_text in input_text:
            if prompt_template is not None:
                curr_text = prompt_template.format(input_text=curr_text)
            input_ids = tokenizer.encode(curr_text,
                                         add_special_tokens=add_special_tokens,
                                         truncation=True,
                                         max_length=max_input_length)
            batch_input_ids.append(input_ids)
    elif input_file.endswith('.csv'):
        with open(input_file, 'r') as csv_file:
            for line in csv.reader(csv_file, delimiter=','):
                input_ids = np.array(line, dtype='int32')
                # Plain lists keep the prepend/append steps below working;
                # list + ndarray would not concatenate.
                batch_input_ids.append(
                    input_ids[-max_input_length:].tolist())
    elif input_file.endswith('.npy'):
        for row in np.load(input_file):
            input_ids = row[row != pad_id]
            batch_input_ids.append(input_ids[-max_input_length:].tolist())
    elif input_file.endswith('.txt'):
        with open(input_file, 'r', encoding='utf-8',
                  errors='replace') as txt_file:
            input_text = txt_file.readlines()
        batch_input_ids = tokenizer(
            input_text,
            add_special_tokens=add_special_tokens,
            truncation=True,
            max_length=max_input_length)["input_ids"]
    else:
        # Passing the message makes the process exit with a failure status
        # instead of status 0.
        raise SystemExit('Input file format not supported.')

    if num_prepend_vtokens:
        assert len(num_prepend_vtokens) == len(batch_input_ids)
        base_vocab_size = tokenizer.vocab_size - len(
            tokenizer.special_tokens_map.get('additional_special_tokens', []))
        for i, length in enumerate(num_prepend_vtokens):
            batch_input_ids[i] = list(
                range(base_vocab_size,
                      base_vocab_size + length)) + list(batch_input_ids[i])

    if model_name == 'ChatGLMForCausalLM' and model_version == 'glm':
        for ids in batch_input_ids:
            ids.append(tokenizer.sop_token_id)

    return [torch.tensor(x, dtype=torch.int32) for x in batch_input_ids]
def print_output(tokenizer,
                 output_ids,
                 input_lengths,
                 sequence_lengths,
                 output_csv=None,
                 output_npy=None,
                 context_logits=None,
                 generation_logits=None,
                 cum_log_probs=None,
                 log_probs=None,
                 output_logits_npy=None,
                 output_cum_log_probs_npy=None,
                 output_log_probs_npy=None):
    """Print decoded generations and/or dump raw results to disk.

    Args:
        tokenizer: Object exposing ``decode(list_of_ids)``.
        output_ids: Tensor indexed as ``[batch, beam, token]``.
        input_lengths: Per-batch prompt length; tokens before it are the
            prompt, tokens from it up to ``sequence_lengths`` are generated.
        sequence_lengths: Indexed as ``[batch][beam]``; end of each sequence.
        output_csv: Optional CSV path for the flattened token ids.
        output_npy: Optional ``.npy`` path for the flattened token ids.
        context_logits: Optional sequence of tensors, concatenated on axis 0.
        generation_logits: Optional tensor; saved only when there is one beam.
        cum_log_probs: Optional tensor of cumulative log-probs.
        log_probs: Optional tensor of log-probs.
        output_logits_npy: Base path; ``_context`` / ``_generation`` is
            appended to the part before ``.npy``.
        output_cum_log_probs_npy: Optional path for ``cum_log_probs``.
        output_log_probs_npy: Optional path for ``log_probs``.

    Text is printed only when neither ``output_csv`` nor ``output_npy`` is
    given. Parent directories are created for every file that is written.
    """

    def _dump(target, tensor, dtype):
        # Shared by every .npy output so a missing directory never makes
        # one of them fail while the others succeed.
        target = Path(target)
        target.parent.mkdir(exist_ok=True, parents=True)
        np.save(target, np.array(tensor.cpu().contiguous(), dtype=dtype))

    batch_size, num_beams, _ = output_ids.size()
    show_text = output_csv is None and output_npy is None
    if show_text:
        for batch_idx in range(batch_size):
            output_begin = input_lengths[batch_idx]
            inputs = output_ids[batch_idx][0][:output_begin].tolist()
            input_text = tokenizer.decode(inputs)
            print(f'Input [Text {batch_idx}]: "{input_text}"')
            for beam in range(num_beams):
                output_end = sequence_lengths[batch_idx][beam]
                outputs = output_ids[batch_idx][beam][
                    output_begin:output_end].tolist()
                output_text = tokenizer.decode(outputs)
                print(
                    f'Output [Text {batch_idx} Beam {beam}]: "{output_text}"')

    # Collapse batch and beam into one row axis for the file dumps.
    output_ids = output_ids.reshape((-1, output_ids.size(2)))
    if output_csv is not None:
        output_file = Path(output_csv)
        output_file.parent.mkdir(exist_ok=True, parents=True)
        # The csv module expects files opened with newline=''.
        with open(output_file, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            writer.writerows(output_ids.tolist())

    if output_npy is not None:
        _dump(output_npy, output_ids, 'int32')

    if output_logits_npy is not None:
        logits_stem = output_logits_npy.split('.npy')[0]

        # Save context logits
        if context_logits is not None:
            context_logits = torch.cat(context_logits, axis=0)
            vocab_size_padded = context_logits.shape[-1]
            context_logits = context_logits.reshape(
                [1, -1, vocab_size_padded])
            # Saved array is [promptLengthSum, vocabSize].
            _dump(logits_stem + "_context", context_logits.squeeze(0),
                  'float32')

        # Save generation logits
        if generation_logits is not None and num_beams == 1:
            _dump(logits_stem + "_generation", generation_logits, 'float32')

    # Save cum log probs
    if cum_log_probs is not None and output_cum_log_probs_npy is not None:
        _dump(output_cum_log_probs_npy, cum_log_probs, 'float32')

    # Save log probs
    if log_probs is not None and output_log_probs_npy is not None:
        _dump(output_log_probs_npy, log_probs, 'float32')
def _generate_kwargs(args, end_id, pad_id, stop_words_list, bad_words_list):
    """Collect the ``runner.generate`` arguments shared by every call in main."""
    return dict(
        max_new_tokens=args.max_output_len,
        max_attention_window_size=args.max_attention_window_size,
        end_id=end_id,
        pad_id=pad_id,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        num_beams=args.num_beams,
        length_penalty=args.length_penalty,
        early_stopping=args.early_stopping,
        repetition_penalty=args.repetition_penalty,
        presence_penalty=args.presence_penalty,
        frequency_penalty=args.frequency_penalty,
        stop_words_list=stop_words_list,
        bad_words_list=bad_words_list,
        output_cum_log_probs=args.output_cum_log_probs_npy is not None,
        output_log_probs=args.output_log_probs_npy is not None,
        random_seed=args.random_seed,
        lora_uids=args.lora_task_uids,
        prompt_table=args.prompt_table_path,
        prompt_tasks=args.prompt_tasks,
        streaming=args.streaming,
        output_sequence_lengths=True,
        return_dict=True,
    )


def main(args):
    """Run one generation, report it, then optionally time a single pass.

    Steps: detect encoder-decoder engines from the engine directory layout,
    load the tokenizer, tokenize the prompts, build the runner, generate once
    and print/save the result on rank 0. With ``args.run_profiling`` ten
    warm-up passes are followed by one timed pass, and rank 0 appends a row
    (hardware, tp size, framework, model, input length, batch size,
    weight/KV dtypes, latency, throughput) to
    ``precision_results_<model>.csv``. No header row is written.
    """
    runtime_rank = tensorrt_llm.mpi_rank()
    logger.set_level(args.log_level)

    # different handling if encoder-decoder models
    engine_subdirs = {
        name
        for name in os.listdir(args.engine_dir)
        if os.path.isdir(os.path.join(args.engine_dir, name))
    }
    is_enc_dec = engine_subdirs == {'encoder', 'decoder'}
    if is_enc_dec:
        logger.warning(
            "This path is an encoder-decoder model. Using different handling.")
        assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead."

    model_name, model_version = read_model_name(
        args.engine_dir) if not is_enc_dec else ("", "")
    if args.tokenizer_dir is None:
        logger.warning(
            "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect."
        )
        args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name]

    tokenizer, pad_id, end_id = load_tokenizer(
        args,
        tokenizer_dir=args.tokenizer_dir,
        vocab_file=args.vocab_file,
        model_name=model_name,
        model_version=model_version,
        tokenizer_type=args.tokenizer_type,
    )

    stop_words_list = None
    if args.stop_words:
        stop_words_list = tensorrt_llm.runtime.decode_words_list(
            args.stop_words, tokenizer)

    bad_words_list = None
    if args.bad_words:
        bad_words_list = tensorrt_llm.runtime.decode_words_list(
            args.bad_words, tokenizer)

    prompt_template = None
    if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES:
        prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name]
    batch_input_ids = parse_input(tokenizer=tokenizer,
                                  input_text=args.input_text,
                                  prompt_template=prompt_template,
                                  input_file=args.input_file,
                                  add_special_tokens=args.add_special_tokens,
                                  max_input_length=args.max_input_length,
                                  pad_id=pad_id,
                                  num_prepend_vtokens=args.num_prepend_vtokens,
                                  model_name=model_name,
                                  model_version=model_version)

    if is_enc_dec:
        encoder_input_ids = batch_input_ids
        decoder_input_ids = [
            torch.tensor([pad_id], dtype=torch.int32) for _ in batch_input_ids
        ]  # by default decoder_start_token_id for T5

    input_lengths = [x.size(0) for x in decoder_input_ids
                     ] if is_enc_dec else [x.size(0) for x in batch_input_ids]
    encoder_input_lengths = [x.size(0)
                             for x in encoder_input_ids] if is_enc_dec else None

    if not PYTHON_BINDINGS and not args.use_py_session:
        logger.warning(
            "Python bindings of C++ session is unavailable, fallback to Python session."
        )
        args.use_py_session = True
    if args.debug_mode and not args.use_py_session:
        logger.warning(
            "Debug mode is not supported in C++ session for now, fallback to Python session."
        )
        args.use_py_session = True
    runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp
    runner_kwargs = dict(
        engine_dir=args.engine_dir,
        lora_dir=args.lora_dir,
        rank=runtime_rank,
        debug_mode=args.debug_mode,
        lora_ckpt_source=args.lora_ckpt_source,
        gpu_weights_percent=args.gpu_weights_percent,
    )
    if not args.use_py_session:
        runner_kwargs.update(is_enc_dec=is_enc_dec)
    if args.medusa_choices is not None:
        args.medusa_choices = ast.literal_eval(args.medusa_choices)
        assert args.temperature == 1.0, "Medusa should use temperature == 1.0"
        assert args.num_beams == 1, "Medusa should use num_beams == 1"
        runner_kwargs.update(medusa_choices=args.medusa_choices)
    if not args.use_py_session:
        runner_kwargs.update(
            max_batch_size=len(batch_input_ids),
            max_input_len=max(
                encoder_input_lengths if is_enc_dec else input_lengths),
            max_output_len=args.max_output_len,
            max_beam_width=args.num_beams,
            max_attention_window_size=args.max_attention_window_size,
            sink_token_length=args.sink_token_length,
            max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache,
            kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse,
            kv_cache_free_gpu_memory_fraction=args.
            kv_cache_free_gpu_memory_fraction,
            enable_chunked_context=args.enable_chunked_context,
        )
    runner = runner_cls.from_dir(**runner_kwargs)

    shared_kwargs = _generate_kwargs(args, end_id, pad_id, stop_words_list,
                                     bad_words_list)

    with torch.no_grad():
        outputs = runner.generate(
            batch_input_ids=decoder_input_ids
            if is_enc_dec else batch_input_ids,
            encoder_input_ids=encoder_input_ids if is_enc_dec else None,
            sink_token_length=args.sink_token_length,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            medusa_choices=args.medusa_choices,
            **shared_kwargs)
        torch.cuda.synchronize()

    save_cum_log_probs = args.output_cum_log_probs_npy is not None
    save_log_probs = args.output_log_probs_npy is not None

    if args.streaming:
        for curr_outputs in throttle_generator(outputs,
                                               args.streaming_interval):
            if runtime_rank == 0:
                # Read from the streamed chunk: `outputs` is the generator
                # here and cannot be indexed by key.
                print_output(
                    tokenizer,
                    curr_outputs['output_ids'],
                    input_lengths,
                    curr_outputs['sequence_lengths'],
                    output_csv=args.output_csv,
                    output_npy=args.output_npy,
                    cum_log_probs=(curr_outputs['cum_log_probs']
                                   if save_cum_log_probs else None),
                    log_probs=(curr_outputs['log_probs']
                               if save_log_probs else None),
                    output_cum_log_probs_npy=args.output_cum_log_probs_npy,
                    output_log_probs_npy=args.output_log_probs_npy)
    else:
        if runtime_rank == 0:
            context_logits = None
            generation_logits = None
            if runner.gather_context_logits:
                context_logits = outputs['context_logits']
            if runner.gather_generation_logits:
                generation_logits = outputs['generation_logits']
            print_output(
                tokenizer,
                outputs['output_ids'],
                input_lengths,
                outputs['sequence_lengths'],
                output_csv=args.output_csv,
                output_npy=args.output_npy,
                context_logits=context_logits,
                generation_logits=generation_logits,
                output_logits_npy=args.output_logits_npy,
                cum_log_probs=(outputs['cum_log_probs']
                               if save_cum_log_probs else None),
                log_probs=(outputs['log_probs'] if save_log_probs else None),
                output_cum_log_probs_npy=args.output_cum_log_probs_npy,
                output_log_probs_npy=args.output_log_probs_npy)

    if args.run_profiling:
        # warmup
        warmup_iters = 10
        for _ in range(warmup_iters):
            with torch.no_grad():
                outputs = runner.generate(batch_input_ids, **shared_kwargs)
                torch.cuda.synchronize()

        # Timed section: a single pass, synchronized before the timer stops.
        ite = 1
        tensorrt_llm.profiler.start("tmp")
        for _ in range(ite):
            with torch.no_grad():
                outputs = runner.generate(batch_input_ids, **shared_kwargs)
                torch.cuda.synchronize()
        tensorrt_llm.profiler.stop("tmp")

        latency = tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite
        print(
            f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {latency} sec"
        )

        # Hub-style names ("org/model") are reduced to their last component.
        file_path = "precision_results_" + str(
            args.model_name.split("/")[-1]) + ".csv"

        if runtime_rank == 0:
            # Throughput counts prompt plus generated tokens using the
            # configured maxima, not the measured lengths.
            throughput = (args.batch_size *
                          (args.max_input_length + args.max_output_len)
                          ) / latency
            Weight_KV_dtype = f"Weight = {args.qformat}, KV Cache = {args.kv_cache_dtype}"
            # NOTE(review): label changed from "Nvidia A100 GPU"; this
            # script lives in the H100 benchmark directory. Confirm with the
            # result consumers before merging old and new CSVs.
            data = [[
                "Nvidia H100 GPU",
                str(args.tp_size),
                "TensorRT-LLM",
                args.model_name,
                str(args.max_input_length),
                str(args.batch_size),
                Weight_KV_dtype,
                str(latency),
                str(throughput),
            ]]

            with open(file_path, 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerows(data)
import random
import string


def generate_random_word(length):
    """Return a string of ``length`` random ASCII letters."""
    alphabet = string.ascii_letters
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)


def generate_input(args):
    """Build the synthetic benchmark prompts.

    The prompt is "France" repeated ``args.max_input_length`` times with
    single spaces in between, and it is returned ``args.batch_size`` times.
    """
    words = ["France"] * args.max_input_length
    prompt = " ".join(words)
    return [prompt for _ in range(args.batch_size)]


if __name__ == '__main__':
    args = parse_arguments()
    args.input_text = generate_input(args)
    main(args)
+import json +from pathlib import Path +from typing import Optional + +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer + +from tensorrt_llm.bindings import GptJsonConfig +from tensorrt_llm.builder import get_engine_version + +DEFAULT_HF_MODEL_DIRS = { + 'BaichuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BloomForCausalLM': 'bigscience/bloom-560m', + 'GLMModel': 'THUDM/glm-10b', + 'ChatGLMModel': 'THUDM/chatglm3-6b', + 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', + 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', + 'GPTForCausalLM': 'gpt2-medium', + 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', + 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', + 'InternLMForCausalLM': 'internlm/internlm-chat-7b', + 'InternLM2ForCausalLM': 'internlm/internlm2-chat-7b', + 'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf', + 'MPTForCausalLM': 'mosaicml/mpt-7b', + 'PhiForCausalLM': 'microsoft/phi-2', + 'OPTForCausalLM': 'facebook/opt-350m', + 'QWenLMHeadModel': 'Qwen/Qwen-7B', + 'QWenForCausalLM': 'Qwen/Qwen-7B', + 'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B', + 'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B', + 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', +} + +INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). +- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. +- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
def throttle_generator(generator, stream_interval):
    """Yield every ``stream_interval``-th item of ``generator``, plus the last.

    Items at indices 0, ``stream_interval``, 2 * ``stream_interval``, ... are
    passed through.  When iteration ends on an index that was not passed
    through, that final item is yielded as well so the consumer always sees
    the last output.

    Args:
        generator: Any iterable of outputs.
        stream_interval: Spacing between yielded items; used as a modulus, so
            a value of 0 raises ``ZeroDivisionError`` once an item is seen.

    Yields:
        The selected items, in their original order.  An empty ``generator``
        yields nothing.
    """
    # Sentinel values: they stay untouched when ``generator`` is empty, which
    # previously left the loop variables unbound and crashed the final check.
    index = -1
    out = None

    for index, out in enumerate(generator):
        if index % stream_interval == 0:
            yield out

    # Flush the final item only if something was consumed and it was skipped.
    if index >= 0 and index % stream_interval:
        yield out
def add_common_args(parser):
    """Register the shared command-line options on ``parser``.

    The options are grouped as sampling, common runtime, model, memory and
    HF-model arguments.  Option names, destinations, types, defaults and
    choices are the public CLI contract of the scripts that call this helper.

    Args:
        parser: An ``argparse.ArgumentParser`` (or any object exposing a
            compatible ``add_argument`` method).

    Returns:
        The same ``parser`` object, to allow call chaining.
    """
    # sampling arguments
    parser.add_argument('--num_beams',
                        type=int,
                        help="Use beam search if num_beams > 1",
                        default=1)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0.0)
    parser.add_argument('--length_penalty', type=float, default=1.0)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--presence_penalty', type=float, default=0.0)
    parser.add_argument('--frequency_penalty', type=float, default=0.0)
    parser.add_argument('--beam_search_diversity_rate', type=float, default=0.0)
    parser.add_argument('--random_seed', type=int, default=0)
    # The adjacent literals are concatenated verbatim, so each piece must
    # carry its own separator; without them the help text ran together.
    parser.add_argument('--early_stopping',
                        type=int,
                        help='Use early stopping if num_beams > 1: '
                        '1 for early-stopping, 0 for non-early-stopping, '
                        'other values for stopping by length',
                        default=1)
    parser.add_argument(
        '--end_id',
        default=None,
        type=int,
        help="Override tokenizer end_id to stop on given end_id token.")
    # nargs="+" with action='append' produces a list of lists: one inner list
    # per occurrence of the flag.
    parser.add_argument(
        '--stop_words',
        default=None,
        type=str,
        nargs="+",
        action='append',
        help=
        'Set stop words for a batch. Successive invocations of --stop_words set stop words for other batches.'
        ' E.g.: --stop_words " London" " chef" --stop_words "eventually became" "was not"',
    )
    parser.add_argument(
        '--bad_words',
        default=None,
        type=str,
        nargs="+",
        action='append',
        help=
        'Set bad words for a batch. Successive invocations of --bad_words set bad words for other batches.'
        ' E.g.: --bad_words " London" " chef" --bad_words "eventually became" "was not"',
    )
    parser.add_argument('--no_repeat_ngram_size', type=int, default=None)

    # common runtime arguments
    parser.add_argument('--sink_token_length',
                        type=int,
                        default=None,
                        help='The sink token length.')
    parser.add_argument(
        '--max_attention_window_size',
        type=int,
        default=None,
        help=
        'The attention window size that controls the sliding window attention / cyclic kv cache behavior'
    )
    parser.add_argument(
        '--multi_block_mode',
        action='store_true',
        help=
        "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel."
    )
    parser.add_argument('--log_level', type=str, default='info')
    # Negative flag: passing it stores False into ``use_prompt_template``.
    parser.add_argument(
        '--no_prompt_template',
        dest='use_prompt_template',
        default=True,
        action='store_false',
        help=
        "Whether or not to use default prompt template to wrap the input text.")
    parser.add_argument('--use_py_session',
                        default=False,
                        action='store_true',
                        help="Whether or not to use Python runtime session")
    parser.add_argument('--debug_mode',
                        default=False,
                        action='store_true',
                        help="Whether or not to turn on the debug mode")
    parser.add_argument('--streaming', default=False, action='store_true')
    parser.add_argument('--streaming_interval',
                        type=int,
                        help="How often to return tokens when streaming.",
                        default=5)
    parser.add_argument(
        '--prompt_table_path',
        type=str,
        help="Path to .npy file, exported by nemo_prompt_convert.py")
    parser.add_argument(
        '--prompt_tasks',
        help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0")
    parser.add_argument('--lora_dir',
                        type=str,
                        default=None,
                        nargs="+",
                        help="The directory of LoRA weights")
    parser.add_argument('--lora_ckpt_source',
                        type=str,
                        default="hf",
                        choices=["hf", "nemo"],
                        help="The source of lora checkpoint.")
    parser.add_argument(
        '--lora_task_uids',
        type=str,
        default=None,
        nargs="+",
        help="The list of LoRA task uids; use -1 to disable the LoRA module")
    parser.add_argument(
        '--num_prepend_vtokens',
        nargs="+",
        type=int,
        help="Number of (default) virtual tokens to prepend to each sentence."
        " For example, '--num_prepend_vtokens=10' will prepend the tokens"
        " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.")
    parser.add_argument(
        '--medusa_choices',
        type=str,
        default=None,
        help="Medusa choice to use, if not none, will use Medusa decoding."
        " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens."
    )

    # model arguments
    parser.add_argument('--engine_dir', type=str, default='engine_outputs')
    parser.add_argument(
        '--tokenizer_type',
        help=
        'Specify that argument when providing a .model file as the tokenizer_dir. '
        'It allows AutoTokenizer to instantiate the correct tokenizer type.')
    parser.add_argument('--vocab_file',
                        help="Used for sentencepiece tokenizers")
    # Negative flag: passing it stores False into ``add_special_tokens``.
    parser.add_argument('--no_add_special_tokens',
                        dest='add_special_tokens',
                        default=True,
                        action='store_false',
                        help="Whether or not to add special tokens")
    # ``--model_dir`` is an alias; the value lands in ``hf_model_dir``.
    parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None)
    parser.add_argument(
        '--tokenizer_dir',
        default=None,
        help='tokenizer path; defaults to hf_model_dir if left unspecified')

    # memory argument
    parser.add_argument(
        '--gpu_weights_percent',
        default=1,
        type=float,
        help=
        'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.',
    )
    parser.add_argument(
        '--max_tokens_in_paged_kv_cache',
        default=None,
        type=int,
        help=
        'Specify the maximum number of tokens in a kv cache page (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_enable_block_reuse',
        action='store_true',
        help=
        'Enables block reuse in kv cache (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_free_gpu_memory_fraction',
        default=0.9,
        type=float,
        help='Specify the free gpu memory fraction.',
    )
    parser.add_argument(
        '--enable_chunked_context',
        action='store_true',
        help='Enables chunked context (only available with cpp session).',
    )

    # hf model argument (if use hf model)
    # ``--data_type`` is an alias; the value lands in ``hf_data_type``.
    parser.add_argument(
        '--hf_data_type',
        '--data_type',
        type=str,
        choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'],
        default='fp16',
        help="The data type for hf model.")
    parser.add_argument(
        '--hf_device_map_auto',
        action='store_true',
        help="Use device map 'auto' to load a pretrained HF model. This may "
        "help to test a large model that cannot fit into a single GPU.")

    parser.add_argument(
        "--return_all_generated_tokens",
        default=False,
        action="store_true",
        help="This option changes the token output only for streaming. "
        "If not specified, return only generated tokens at each step. "
        "If specified, return the full beams/outputs at each step. "
        "It is automatically enabled for num_beams>1 (only available with cpp session). "
        "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)."
    )

    return parser