Wordcab · aleksandr-smechov · Jun 12, 2024 · Jun 12, 2024
diff --git a/.env b/.env
@@ -41,6 +41,10 @@ WHISPER_MODEL="large-v3"
 # You can specify one of two engines, "faster-whisper" or "tensorrt-llm". At the moment, "faster-whisper" is more
 # stable, adjustable, and accurate, while "tensorrt-llm" is faster but less accurate and adjustable.
 WHISPER_ENGINE="tensorrt-llm"
+# This helps adjust some build steps during the conversion of the Whisper model to TensorRT. If you change this, be sure
+# to also change it in pre_requirements.txt. The only available options are "0.9.0.dev2024032600" and "0.11.0.dev2024052100".
+# Note that version "0.11.0.dev2024052100" is not compatible with T4 or V100 GPUs.
+TENSORRT_LLM_VERSION="0.9.0.dev2024032600"
 # The align model is used for aligning timestamps under the "tensorrt-llm" engine. The available options are:
 # "tiny", "small", "base", or "medium".
 ALIGN_MODEL="tiny"
@@ -60,7 +64,7 @@ TOKENIZERS_PARALLELISM=False
 # The diarization_backend parameter is used to control the diarization model used. The available options are:
 # "longform-diarizer" or "default-diarizer". It's suggested to use "default-diarizer" for better stability.
 # The "longform-diarizer" is still being developed.
-DIARIZATION_BACKEND="default-diarizer"
+DIARIZATION_BACKEND="longform-diarizer"
 # In a MSDD (Multiscale Diarization Decoder) model, the diarization model is trained on multiple window lengths.
 # The window_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
 # be "1.5, 1.25, 1.0, 0.75, 0.5".
@@ -97,7 +101,7 @@ ASR_TYPE="async"
 #
 # Include the cortex endpoint in the API. This endpoint is used to process audio files from the Cortex API.
 # Use this only if you deploy the API using Cortex and Kubernetes.
-CORTEX_ENDPOINT=True
+CORTEX_ENDPOINT=False
 #
 # ---------------------------------------- API AUTHENTICATION CONFIGURATION ------------------------------------------ #
 # The API authentication is used to control the access to the API endpoints.
@@ -130,6 +134,7 @@ SVIX_APP_ID=
 #
 # ----------------------------------------------- AWS CONFIGURATION ------------------------------------------------- #
 #
+SEND_RESULTS_TO_S3=False
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
 AWS_STORAGE_BUCKET_NAME=

diff --git a/.gitignore b/.gitignore
@@ -21,3 +21,7 @@ test.ipynb
 test.py
 whisper_model
 whisper_model_he
+storage
+nemo_storage
+nemo_local
+error.log
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
diff --git a/error.log b/error.log
@@ -0,0 +1 @@
+/app/temp_outputs/mono_file.wav:[Errno 2] No such file or directory: '/app/temp_outputs/mono_file.wav'
diff --git a/nemo_local b/nemo_local
diff --git a/nemo_storage/infer_manifest.json b/nemo_storage/infer_manifest.json
@@ -0,0 +1 @@
+{"audio_filepath": "/app/temp_outputs/mono_file.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "rttm_filepath": null, "uem_filepath": null}
diff --git a/pre_requirements.txt b/pre_requirements.txt
@@ -6,7 +6,7 @@ shortuuid==1.0.13
 svix==1.21.0
 uvicorn==0.29.0
 websockets==12.0
-tensorrt_llm==0.11.0.dev2024052100
+tensorrt_llm==0.9.0.dev2024032600
 Cython==3.0.10
 youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies
 deepmultilingualpunctuation==1.0.1

diff --git a/src/wordcab_transcribe/config.py b/src/wordcab_transcribe/config.py
@@ -69,6 +69,7 @@ class Settings:
     # Cortex configuration
     cortex_api_key: str
     # AWS configuration
+    send_results_to_s3: bool
     aws_access_key_id: str
     aws_secret_access_key: str
     aws_storage_bucket_name: str
@@ -137,7 +138,7 @@ def align_model_compatibility_check(cls, value: str):  # noqa: B902, N805
         """Check that the whisper engine is compatible."""
         if value.lower() not in ["tiny", "small", "base", "medium"]:
             raise ValueError(
-                "The whisper engine must be one of `tiny`, `small`, `base`, or"
+                "The align model must be one of `tiny`, `small`, `base`, or"
                 " `medium`."
             )
 
@@ -348,6 +349,7 @@ def __post_init__(self):
     # Cortex configuration
     cortex_api_key=getenv("WORDCAB_TRANSCRIBE_API_KEY", ""),
     # AWS configuration
+    send_results_to_s3=getenv("SEND_RESULTS_TO_S3", False),
     aws_access_key_id=getenv("AWS_ACCESS_KEY_ID", ""),
     aws_secret_access_key=getenv("AWS_SECRET_ACCESS_KEY", ""),
     aws_storage_bucket_name=getenv("AWS_STORAGE_BUCKET_NAME", ""),

diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py
@@ -12,25 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
 import os
 import time
+import argparse
 
-import tensorrt_llm
 import torch
+from loguru import logger
+
+import tensorrt_llm
 from tensorrt_llm import str_dtype_to_torch, str_dtype_to_trt
 from tensorrt_llm.builder import Builder
 from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
 from tensorrt_llm.logger import logger
 from tensorrt_llm.network import net_guard
 from tensorrt_llm.plugin.plugin import ContextFMHAType
 from tensorrt_llm.quantization import QuantMode
-from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
 from weight import load_decoder_weight, load_encoder_weight
 
 MODEL_ENCODER_NAME = "whisper_encoder"
 MODEL_DECODER_NAME = "whisper_decoder"
 
+TENSORRT_LLM_VERSION = os.getenv("TENSORRT_LLM_VERSION", "")  # unset -> "" so the ValueError below fires, not a TypeError
+if "0.9.0" in TENSORRT_LLM_VERSION:  # quantize_model is imported from a version-specific module path
+    from tensorrt_llm.models import quantize_model
+elif "0.11.0" in TENSORRT_LLM_VERSION:
+    from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
+else:  # unset, empty, or any other version string
+    raise ValueError(f"Unsupported version of tensorrt_llm: {TENSORRT_LLM_VERSION!r}")
 
 def get_engine_name(model, dtype, tp_size=1, rank=0):
     return "{}_{}_tp{}_rank{}.engine".format(model, dtype, tp_size, rank)
@@ -79,7 +87,7 @@ def parse_arguments():
     parser.add_argument("--quantize_dir", type=str, default="quantize/1-gpu")
     parser.add_argument("--dtype", type=str, default="float16", choices=["float16"])
     parser.add_argument("--log_level", type=str, default="info")
-    parser.add_argument("--max_batch_size", type=int, default=24)
+    parser.add_argument("--max_batch_size", type=int, default=16)
     parser.add_argument("--max_input_len", type=int, default=4)
     parser.add_argument("--max_output_len", type=int, default=448)
     parser.add_argument("--max_beam_width", type=int, default=1)
@@ -315,32 +323,63 @@ def build_decoder(model, args):
         int8=args.quant_mode.has_act_or_weight_quant(),
     )
 
-    tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
-        tensorrt_llm.models.modeling_utils.PretrainedConfig(
-            architecture="whisper",
-            dtype=str_dtype_to_trt(args.dtype),
-            logits_dtype=str_dtype_to_trt(args.dtype),
-            vocab_size=model_metadata["n_vocab"],
-            max_position_embeddings=model_metadata["n_text_ctx"],
-            hidden_size=model_metadata["n_text_state"],
-            num_hidden_layers=model_metadata["n_text_layer"],
-            num_attention_heads=model_metadata["n_text_head"],
-            num_key_value_heads=model_metadata["n_text_head"],
-            hidden_act="gelu",
-            intermediate_size=4 * model_metadata["n_text_state"],
-            norm_epsilon=1e-5,
-            position_embedding_type="learned_absolute",
-            world_size=1,
-            tp_size=1,
-            pp_size=1,
-            gpus_per_node=1,
-            quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
-            head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
+    try:
+        tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
+            tensorrt_llm.models.modeling_utils.PretrainedConfig(
+                architecture="whisper",
+                dtype=str_dtype_to_trt(args.dtype),
+                logits_dtype=str_dtype_to_trt(args.dtype),
+                vocab_size=model_metadata["n_vocab"],
+                max_position_embeddings=model_metadata["n_text_ctx"],
+                hidden_size=model_metadata["n_text_state"],
+                num_hidden_layers=model_metadata["n_text_layer"],
+                num_attention_heads=model_metadata["n_text_head"],
+                num_key_value_heads=model_metadata["n_text_head"],
+                hidden_act="gelu",
+                intermediate_size=4 * model_metadata["n_text_state"],
+                norm_epsilon=1e-5,
+                position_embedding_type="learned_absolute",
+                world_size=1,
+                tp_size=1,
+                pp_size=1,
+                gpus_per_node=1,
+                quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
+                head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
+                num_layers=model_metadata["n_text_layer"],
+                num_heads=model_metadata["n_text_head"],
+                ffn_hidden_size=4 * model_metadata["n_text_state"],
+                encoder_hidden_size=model_metadata["n_text_state"],
+                encoder_num_heads=model_metadata["n_text_head"],
+                has_position_embedding=True,
+                relative_attention=False,
+                max_distance=0,
+                num_buckets=0,
+                has_embedding_layernorm=False,
+                has_embedding_scale=False,
+                q_scaling=1.0,
+                has_attention_qkvo_bias=True,
+                has_mlp_bias=True,
+                has_model_final_layernorm=True,
+                layernorm_eps=1e-5,
+                layernorm_position=LayerNormPositionType.pre_layernorm,
+                layernorm_type=LayerNormType.LayerNorm,
+                rescale_before_lm_head=False,
+                encoder_head_size=model_metadata["n_text_state"]
+                // model_metadata["n_text_head"],  # Added missing variable
+                skip_cross_qkv=False,
+            )
+        )
+    except:
+        tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
             num_layers=model_metadata["n_text_layer"],
             num_heads=model_metadata["n_text_head"],
+            hidden_size=model_metadata["n_text_state"],
             ffn_hidden_size=4 * model_metadata["n_text_state"],
             encoder_hidden_size=model_metadata["n_text_state"],
             encoder_num_heads=model_metadata["n_text_head"],
+            vocab_size=model_metadata["n_vocab"],
+            head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
+            max_position_embeddings=model_metadata["n_text_ctx"],
             has_position_embedding=True,
             relative_attention=False,
             max_distance=0,
@@ -354,12 +393,11 @@ def build_decoder(model, args):
             layernorm_eps=1e-5,
             layernorm_position=LayerNormPositionType.pre_layernorm,
             layernorm_type=LayerNormType.LayerNorm,
+            hidden_act="gelu",
             rescale_before_lm_head=False,
-            encoder_head_size=model_metadata["n_text_state"]
-            // model_metadata["n_text_head"],  # Added missing variable
-            skip_cross_qkv=False,
+            dtype=str_dtype_to_trt(args.dtype),
+            logits_dtype=str_dtype_to_trt(args.dtype),
         )
-    )
 
     if args.use_weight_only:
         tensorrt_llm_whisper_decoder = quantize_model(
@@ -394,7 +432,10 @@ def build_decoder(model, args):
             model_metadata["n_audio_ctx"],
         )
 
-        tensorrt_llm_whisper_decoder(**inputs)
+        if "0.9.0" in TENSORRT_LLM_VERSION:
+            tensorrt_llm_whisper_decoder(*inputs)
+        else:
+            tensorrt_llm_whisper_decoder(**inputs)
 
         if args.debug_mode:
             for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():

diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/create_trt_model.py b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/create_trt_model.py
@@ -63,14 +63,31 @@
 }
 
 
+# Build-time limits for the TensorRT engine. They are the default values of the
+# max_output_len / max_beam_width parameters of build_whisper_trt_model, which
+# forwards them to the build command as --max_output_len / --max_beam_width.
+# Each can be overridden through the environment variable of the same name;
+# an unset or empty variable falls back to the default (448 and 1).
+
+# Output-length limit; defaults to 448.
+TRT_BUILD_MAX_OUTPUT_LEN = int(os.getenv("TRT_BUILD_MAX_OUTPUT_LEN") or 448)
+logger.info(f"TRT_BUILD_MAX_OUTPUT_LEN: {TRT_BUILD_MAX_OUTPUT_LEN}")
+
+# Beam-width limit; defaults to 1.
+TRT_BUILD_MAX_BEAM_WIDTH = int(os.getenv("TRT_BUILD_MAX_BEAM_WIDTH") or 1)
+logger.info(f"TRT_BUILD_MAX_BEAM_WIDTH: {TRT_BUILD_MAX_BEAM_WIDTH}")
+
+
 def build_whisper_trt_model(
     output_dir,
     use_gpt_attention_plugin=True,
     use_gemm_plugin=True,
     use_bert_attention_plugin=True,
     enable_context_fmha=True,
     use_weight_only=False,
-    model_name="distil-large-v2",
+    max_output_len=TRT_BUILD_MAX_OUTPUT_LEN,
+    max_beam_width=TRT_BUILD_MAX_BEAM_WIDTH,
+    model_name="large-v3",
 ):
     """
     Build a Whisper model using the specified configuration.
@@ -158,6 +175,10 @@ def build_whisper_trt_model(
             command.append("--enable_context_fmha")
         if use_weight_only:
             command.append("--use_weight_only")
+        if max_output_len:
+            command.extend(["--max_output_len", str(max_output_len)])
+        if max_beam_width:
+            command.extend(["--max_beam_width", str(max_beam_width)])
 
         try:
             subprocess.run(command, check=True)

diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/model.py b/src/wordcab_transcribe/engines/tensorrt_llm/model.py
@@ -39,11 +39,11 @@ def exact_div(x, y):
     "best_of": 5,
     "patience": 1,
     "length_penalty": 1,
-    "repetition_penalty": 1.01,
+    "repetition_penalty": 1.05,
     "no_repeat_ngram_size": 0,
     "compression_ratio_threshold": 2.4,
     "log_prob_threshold": -1.0,
-    "no_speech_threshold": 0.3,
+    "no_speech_threshold": 0.4,
     "prefix": None,
     "suppress_blank": False,
     "suppress_tokens": [-1],
@@ -61,7 +61,7 @@ def exact_div(x, y):
     "best_of": 1,
     "patience": 2,
     "length_penalty": 1,
-    "repetition_penalty": 1.01,
+    "repetition_penalty": 1.05,
     "no_repeat_ngram_size": 0,
     "compression_ratio_threshold": 2.4,
     "log_prob_threshold": -1.0,
@@ -224,7 +224,9 @@ def align_words(
                 start_seq_wise_req[_sot_seq] = [_idx]
 
         token_alignments = [[] for _ in seg_metadata]
+
         for start_seq, req_idx in start_seq_wise_req.items():
+
             res = self.align_model.align(
                 ctranslate2.StorageView.from_array(features[req_idx]),
                 start_sequence=list(start_seq),

diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py b/src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py
@@ -93,8 +93,8 @@ def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
 
         # TODO: Make dynamic max_batch_size and max_beam_width
         decoder_model_config = ModelConfig(
-            max_batch_size=24,
-            max_beam_width=1,
+            max_batch_size=16,
+            max_beam_width=5,
             num_heads=self.decoder_config["num_heads"],
             num_kv_heads=self.decoder_config["num_heads"],
             hidden_size=self.decoder_config["hidden_size"],

diff --git a/src/wordcab_transcribe/models.py b/src/wordcab_transcribe/models.py
@@ -403,6 +403,7 @@ class BaseRequest(BaseModel):
     diarization: bool = False
     batch_size: int = 1
     source_lang: str = "en"
+    num_beams: int = 1
     timestamps: Timestamps = Timestamps.seconds
     vocab: Union[List[str], None] = None
     word_timestamps: bool = False
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		/app/temp_outputs/mono_file.wav:[Errno 2] No such file or directory: '/app/temp_outputs/mono_file.wav'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"audio_filepath": "/app/temp_outputs/mono_file.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "rttm_filepath": null, "uem_filepath": null}