From 2de2b1d29ccf967b98debc84ccf1f6f28552ba29 Mon Sep 17 00:00:00 2001 From: Aleks Date: Mon, 1 Apr 2024 20:24:24 -0400 Subject: [PATCH] Adjusting max batch size in order to build TRT models on smaller GPUs --- .../engines/tensorrt_llm/engine_builder/build.py | 2 +- src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py index acd7ca3..1106a2d 100644 --- a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py +++ b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py @@ -79,7 +79,7 @@ def parse_arguments(): parser.add_argument("--quantize_dir", type=str, default="quantize/1-gpu") parser.add_argument("--dtype", type=str, default="float16", choices=["float16"]) parser.add_argument("--log_level", type=str, default="info") - parser.add_argument("--max_batch_size", type=int, default=64) + parser.add_argument("--max_batch_size", type=int, default=24) parser.add_argument("--max_input_len", type=int, default=4) parser.add_argument("--max_output_len", type=int, default=448) parser.add_argument("--max_beam_width", type=int, default=1) diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py b/src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py index e2c296f..3295a39 100644 --- a/src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py +++ b/src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py @@ -93,7 +93,7 @@ def get_session(self, engine_dir, runtime_mapping, debug_mode=False): # TODO: Make dynamic max_batch_size and max_beam_width decoder_model_config = ModelConfig( - max_batch_size=64, + max_batch_size=24, max_beam_width=1, num_heads=self.decoder_config["num_heads"], num_kv_heads=self.decoder_config["num_heads"],