Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Include option to add vocabulary for better transcription #81

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions app_rvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@ def multilingual_media_conversion(
enable_cache=True,
custom_voices=False,
custom_voices_workers=1,
transcription_vocabulary="",
is_gui=False,
progress=gr.Progress(),
):
Expand Down Expand Up @@ -713,6 +714,7 @@ def multilingual_media_conversion(
SOURCE_LANGUAGE,
literalize_numbers,
segment_duration_limit,
custom_vocab=transcription_vocabulary,
)
logger.debug(
"Transcript complete, "
Expand Down Expand Up @@ -1866,6 +1868,11 @@ def submit(value):
label=lg_conf["ctype_label"],
info=lg_conf["ctype_info"],
)
transcription_vocabulary_gui = gr.Textbox(
label=lg_conf["transcription_custom_vocabulary_label"],
value="",
info=lg_conf["transcription_custom_vocabulary_info"],
)
batch_size = gr.Slider(
minimum=1,
maximum=32,
Expand Down Expand Up @@ -2657,6 +2664,7 @@ def update_tts_list():
enable_cache_gui,
enable_custom_voice,
workers_custom_voice,
transcription_vocabulary_gui,
is_gui_dummy_check,
],
outputs=subs_edit_space,
Expand Down Expand Up @@ -2724,6 +2732,7 @@ def update_tts_list():
enable_cache_gui,
enable_custom_voice,
workers_custom_voice,
transcription_vocabulary_gui,
is_gui_dummy_check,
],
outputs=video_output,
Expand Down
2 changes: 2 additions & 0 deletions soni_translate/languages_gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@
"srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)",
"divide_text_label": "Redivide text segments by:",
"divide_text_info": "(Experimental) Enter a separator to split existing text segments in the source language. The tool will identify occurrences and create new segments accordingly. Specify multiple separators using |, e.g.: !|?|...|。",
"transcription_custom_vocabulary_label": "Custom Vocabulary for transcription",
"transcription_custom_vocabulary_info": "Enter comma(,) separated vocabulary/keywords for better transcription quality (for eg. phising, vishing)",
"diarization_label": "Diarization model",
"tr_process_label": "Translation process",
"out_type_label": "Output type",
Expand Down
17 changes: 9 additions & 8 deletions soni_translate/speech_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,7 @@


def openai_api_whisper(
input_audio_file,
source_lang=None,
chunk_duration=1800
input_audio_file, source_lang=None, chunk_duration=1800, custom_vocab=""
):

info = sf.info(input_audio_file)
Expand Down Expand Up @@ -99,6 +97,7 @@ def openai_api_whisper(
language=language,
response_format="verbose_json",
timestamp_granularities=["segment"],
prompt=custom_vocab,
)

try:
Expand Down Expand Up @@ -152,6 +151,7 @@ def transcribe_speech(
SOURCE_LANGUAGE,
literalize_numbers=True,
segment_duration_limit=15,
custom_vocab="",
):
"""
Transcribe speech using a whisper model.
Expand All @@ -162,6 +162,7 @@ def transcribe_speech(
- compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
- batch_size (int): Batch size for transcription.
- SOURCE_LANGUAGE (str): Source language for transcription.
- custom_vocab (str): Comma-separated words for better transcription.

Returns:
- Tuple containing:
Expand All @@ -175,13 +176,13 @@ def transcribe_speech(
"OpenAI's API Whisper does not support "
"the literalization of numbers."
)
return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)
return openai_api_whisper(audio_wav, SOURCE_LANGUAGE, custom_vocab=custom_vocab)

# https://github.com/openai/whisper/discussions/277
prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
SOURCE_LANGUAGE = (
SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
)
prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab
SOURCE_LANGUAGE = SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"

logger.debug(f"transcription vocabulary: {prompt}, type: {type(prompt)}")
asr_options = {
"initial_prompt": prompt,
"suppress_numerals": literalize_numbers
Expand Down