Adding batch_size option to transcription call
Aleks committed Apr 1, 2024
1 parent 5aa8f3a commit 0c6639e
Showing 8 changed files with 24 additions and 5 deletions.
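In short, the commit threads a per-request batch_size value from the request models and the HTTP endpoints down to the transcription service, and raises the TensorRT-LLM engine's max_batch_size from 24 to 64. As a minimal client sketch of how the new option could be used against the audio-file endpoint (the route path, port, and the "file" multipart field name are assumptions not shown in this diff; batch_size and num_speakers are form fields that are):

    import requests

    # Hypothetical route and port; the actual deployment details are not part of this commit.
    url = "http://localhost:5001/api/v1/audio"

    with open("sample.wav", "rb") as audio_file:
        response = requests.post(
            url,
            # "file" is an assumed multipart field name; batch_size and num_speakers
            # are form fields added/shown in audio_file_endpoint.py below.
            files={"file": audio_file},
            data={"batch_size": 8, "num_speakers": -1},
            timeout=600,
        )

    response.raise_for_status()
    print(response.json())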
@@ -79,7 +79,7 @@ def parse_arguments():
parser.add_argument("--quantize_dir", type=str, default="quantize/1-gpu")
parser.add_argument("--dtype", type=str, default="float16", choices=["float16"])
parser.add_argument("--log_level", type=str, default="info")
parser.add_argument("--max_batch_size", type=int, default=24)
parser.add_argument("--max_batch_size", type=int, default=64)
parser.add_argument("--max_input_len", type=int, default=4)
parser.add_argument("--max_output_len", type=int, default=448)
parser.add_argument("--max_beam_width", type=int, default=1)
2 changes: 1 addition & 1 deletion src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py
@@ -93,7 +93,7 @@ def get_session(self, engine_dir, runtime_mapping, debug_mode=False):

# TODO: Make dynamic max_batch_size and max_beam_width
decoder_model_config = ModelConfig(
-max_batch_size=24,
+max_batch_size=64,
max_beam_width=1,
num_heads=self.decoder_config["num_heads"],
num_kv_heads=self.decoder_config["num_heads"],
4 changes: 4 additions & 0 deletions src/wordcab_transcribe/models.py
@@ -122,6 +122,7 @@ class Config:
},
],
"audio_duration": 2.678,
"batch_size": 1,
"offset_start": None,
"offset_end": None,
"num_speakers": -1,
@@ -176,6 +177,7 @@ class Config:
},
],
"audio_duration": 2.0,
"batch_size": 1,
"offset_start": None,
"offset_end": None,
"num_speakers": -1,
@@ -399,6 +401,7 @@ class BaseRequest(BaseModel):
offset_end: Union[float, None] = None
num_speakers: int = -1
diarization: bool = False
+batch_size: int = 1
source_lang: str = "en"
timestamps: Timestamps = Timestamps.seconds
vocab: Union[List[str], None] = None
@@ -462,6 +465,7 @@ class Config:

json_schema_extra = {
"example": {
"batch_size": 1,
"offset_start": None,
"offset_end": None,
"num_speakers": -1,
4 changes: 4 additions & 0 deletions src/wordcab_transcribe/router/v1/audio_file_endpoint.py
@@ -45,6 +45,7 @@
)
async def inference_with_audio( # noqa: C901
background_tasks: BackgroundTasks,
+batch_size: Union[int, None] = Form(None),  # noqa: B008
offset_start: Union[float, None] = Form(None), # noqa: B008
offset_end: Union[float, None] = Form(None), # noqa: B008
num_speakers: int = Form(-1), # noqa: B008
@@ -77,6 +78,7 @@ async def inference_with_audio( # noqa: C901
offset_end=offset_end,
num_speakers=num_speakers,
diarization=diarization,
+batch_size=batch_size,
source_lang=source_lang,
timestamps=timestamps,
vocab=vocab,
@@ -115,6 +117,7 @@ async def inference_with_audio( # noqa: C901
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
+batch_size=data.batch_size,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
@@ -147,6 +150,7 @@ async def inference_with_audio( # noqa: C901
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
+batch_size=batch_size,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps=data.timestamps,
2 changes: 2 additions & 0 deletions src/wordcab_transcribe/router/v1/audio_url_endpoint.py
@@ -107,6 +107,7 @@ async def process_audio():
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
+batch_size=data.batch_size,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
@@ -130,6 +131,7 @@ async def process_audio():
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
+batch_size=data.batch_size,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps=data.timestamps,
2 changes: 2 additions & 0 deletions src/wordcab_transcribe/router/v1/youtube_endpoint.py
@@ -58,6 +58,7 @@ async def inference_with_youtube(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
+batch_size=data.batch_size,
multi_channel=False,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
@@ -90,6 +91,7 @@ async def inference_with_youtube(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
+batch_size=data.batch_size,
source_lang=data.source_lang,
timestamps=data.timestamps,
vocab=data.vocab,
9 changes: 8 additions & 1 deletion src/wordcab_transcribe/services/asr_service.py
@@ -98,6 +98,7 @@ class ASRTask(BaseModel):
url_type: Union[str, None]
diarization: "DiarizationTask"
duration: float
+batch_size: int
multi_channel: bool
offset_start: Union[float, None]
post_processing: "PostProcessingTask"
@@ -278,7 +279,7 @@ def __init__(
self.local_services: LocalServiceRegistry = LocalServiceRegistry()
self.remote_services: RemoteServiceRegistry = RemoteServiceRegistry()
self.dual_channel_transcribe_options: dict = {
"beam_size": 5,
"beam_size": 1,
"patience": 1,
"length_penalty": 1,
"suppress_blank": False,
@@ -366,6 +367,7 @@ async def inference_warmup(self) -> None:
logger.info(f"Warmup GPU {gpu_index}.")
await self.process_input(
filepath=str(sample_path),
+batch_size=1,
offset_start=None,
offset_end=None,
num_speakers=1,
@@ -386,6 +388,7 @@ async def inference_warmup(self) -> None:
async def process_input( # noqa: C901
self,
filepath: Union[str, List[str]],
+batch_size: Union[int, None],
offset_start: Union[float, None],
offset_end: Union[float, None],
num_speakers: int,
@@ -415,6 +418,8 @@ async def process_input( # noqa: C901
Args:
filepath (Union[str, List[str]]):
Path to the audio file or list of paths to the audio files to process.
+batch_size (Union[int, None]):
+The batch size to use for the transcription. For tensorrt-llm whisper engine only.
offset_start (Union[float, None]):
The start time of the audio file to process.
offset_end (Union[float, None]):
@@ -502,6 +507,7 @@ async def process_input( # noqa: C901
execution=diarization_execution, num_speakers=num_speakers
),
duration=duration,
+batch_size=batch_size,
multi_channel=multi_channel,
offset_start=offset_start,
post_processing=PostProcessingTask(),
@@ -1056,6 +1062,7 @@ async def process_input(
try:
result = self.transcription_service(
audio=data.audio,
+batch_size=data.batch_size,
source_lang=data.source_lang,
model_index=gpu_index,
suppress_blank=False,
4 changes: 2 additions & 2 deletions src/wordcab_transcribe/services/transcribe_service.py
@@ -126,7 +126,7 @@ def __call__(
],
source_lang: str,
model_index: int,
-batch_size: int = 24,
+batch_size: int = 1,
num_beams: int = 1,
suppress_blank: bool = False,
vocab: Union[List[str], None] = None,
@@ -150,7 +150,7 @@
model_index (int):
Index of the model to use.
batch_size (int):
-Batch size to use during generation.
+Batch size to use during generation. Only used for tensorrt_llm engine.
num_beams (int):
Number of beams to use during generation.
suppress_blank (bool):
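Because the per-request batch_size is only consumed by the tensorrt-llm Whisper engine, it is implicitly bounded by the max_batch_size the engine was built with (raised to 64 above). A minimal illustrative sketch of a clamp a caller might apply before invoking the transcription service; this helper is an assumption and is not part of the commit:

    from typing import Union

    # Must match the max_batch_size used when building the TensorRT-LLM engine (64 in this commit).
    MAX_ENGINE_BATCH_SIZE = 64

    def resolve_batch_size(requested: Union[int, None], default: int = 1) -> int:
        """Fall back to a default of 1 when unset and cap at the engine's build-time maximum."""
        if requested is None:
            return default
        return max(1, min(requested, MAX_ENGINE_BATCH_SIZE))

    assert resolve_batch_size(None) == 1    # unset: use the default
    assert resolve_batch_size(8) == 8       # typical request passes through
    assert resolve_batch_size(128) == 64    # capped to the engine limit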
