R3gm · Jalaj-KT · Jul 26, 2024 · Aug 22, 2024 · Aug 22, 2024 · Sep 16, 2024
diff --git a/app_rvc.py b/app_rvc.py
@@ -433,6 +433,7 @@ def multilingual_media_conversion(
         enable_cache=True,
         custom_voices=False,
         custom_voices_workers=1,
+        transcription_vocabulary="",
         is_gui=False,
         progress=gr.Progress(),
     ):
@@ -713,6 +714,7 @@ def multilingual_media_conversion(
                         SOURCE_LANGUAGE,
                         literalize_numbers,
                         segment_duration_limit,
+                        custom_vocab=transcription_vocabulary,
                     )
                 logger.debug(
                     "Transcript complete, "
@@ -1866,6 +1868,11 @@ def submit(value):
                                 label=lg_conf["ctype_label"],
                                 info=lg_conf["ctype_info"],
                             )
+                            transcription_vocabulary_gui = gr.Textbox(
+                                label=lg_conf["transcription_custom_vocabulary_label"],
+                                value="",
+                                info=lg_conf["transcription_custom_vocabulary_info"],
+                            )
                             batch_size = gr.Slider(
                                 minimum=1,
                                 maximum=32,
@@ -2657,6 +2664,7 @@ def update_tts_list():
                 enable_cache_gui,
                 enable_custom_voice,
                 workers_custom_voice,
+                transcription_vocabulary_gui,
                 is_gui_dummy_check,
             ],
             outputs=subs_edit_space,
@@ -2724,6 +2732,7 @@ def update_tts_list():
                 enable_cache_gui,
                 enable_custom_voice,
                 workers_custom_voice,
+                transcription_vocabulary_gui,
                 is_gui_dummy_check,
             ],
             outputs=video_output,

diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py
@@ -163,6 +163,8 @@
         "srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)",
         "divide_text_label": "Redivide text segments by:",
         "divide_text_info": "(Experimental) Enter a separator to split existing text segments in the source language. The tool will identify occurrences and create new segments accordingly. Specify multiple separators using |, e.g.: !|?|...|。",
+        "transcription_custom_vocabulary_label": "Custom Vocabulary for transcription",
+        "transcription_custom_vocabulary_info": "Enter comma-separated vocabulary/keywords for better transcription quality (e.g. phishing, vishing)",
         "diarization_label": "Diarization model",
         "tr_process_label": "Translation process",
         "out_type_label": "Output type",

diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py
@@ -59,9 +59,7 @@
 
 
 def openai_api_whisper(
-    input_audio_file,
-    source_lang=None,
-    chunk_duration=1800
+    input_audio_file, source_lang=None, chunk_duration=1800, custom_vocab=""
 ):
 
     info = sf.info(input_audio_file)
@@ -99,6 +97,7 @@ def openai_api_whisper(
           language=language,
           response_format="verbose_json",
           timestamp_granularities=["segment"],
+            prompt=custom_vocab,
         )
 
         try:
@@ -152,6 +151,7 @@ def transcribe_speech(
     SOURCE_LANGUAGE,
     literalize_numbers=True,
     segment_duration_limit=15,
+    custom_vocab="",
 ):
     """
     Transcribe speech using a whisper model.
@@ -162,6 +162,7 @@ def transcribe_speech(
     - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
     - batch_size (int): Batch size for transcription.
     - SOURCE_LANGUAGE (str): Source language for transcription.
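+    - custom_vocab (str): Comma-separated words or phrases passed to Whisper as the prompt to improve transcription (not used when SOURCE_LANGUAGE is "zh", which uses a fixed Mandarin prompt).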
 
     Returns:
     - Tuple containing:
@@ -175,13 +176,13 @@ def transcribe_speech(
                 "OpenAI's API Whisper does not support "
                 "the literalization of numbers."
             )
-        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)
+        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE, custom_vocab=custom_vocab)
 
     # https://github.com/openai/whisper/discussions/277
-    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
-    SOURCE_LANGUAGE = (
-        SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
-    )
+    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab
+    SOURCE_LANGUAGE = SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
+
+    logger.debug(f"transcription vocabulary: {prompt}, type: {type(prompt)}")
     asr_options = {
         "initial_prompt": prompt,
         "suppress_numerals": literalize_numbers