Merge pull request #31 from Wordcab/12-diarization-should-be-improved-for-audio-with-2-speakers

Multiple improvements: diarization, timestamps, Dockerfile, dependencies...
Thomas Chaigneau authored May 5, 2023
2 parents 4f312ca + 9aff56e commit 1e4295e
Showing 18 changed files with 3,393 additions and 1,593 deletions.
6 changes: 4 additions & 2 deletions .env
@@ -1,10 +1,12 @@
 PROJECT_NAME="Wordcab Transcribe"
 VERSION="0.1.0"
-DESCRIPTION="ASR FastAPI server using faster-whisper and pyannote-audio."
+DESCRIPTION="💬 ASR FastAPI server using faster-whisper and NVIDIA NeMo."
 API_PREFIX="/api/v1"
 DEBUG=True
 BATCH_SIZE=1
 MAX_WAIT=0.1
 WHISPER_MODEL="large-v2"
-EMBEDDINGS_MODEL="speechbrain/spkrec-ecapa-voxceleb"
 COMPUTE_TYPE="int8_float16"
+NEMO_DOMAIN_TYPE="telephonic" # Can be general, meeting or telephonic based on domain type of the audio file
+NEMO_STORAGE_PATH="nemo_storage"
+NEMO_OUTPUT_PATH="nemo_outputs"
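For context on how these variables are consumed, here is a minimal sketch of a settings loader using pydantic's BaseSettings, with field names mirroring the .env keys above. This is an illustrative assumption, not the repo's actual settings module.

# settings.py - hypothetical sketch; the repo's real loader may differ
from pydantic import BaseSettings  # pydantic v1-style settings class

class Settings(BaseSettings):
    # Field names mirror the .env keys above (matched case-insensitively).
    project_name: str = "Wordcab Transcribe"
    version: str = "0.1.0"
    description: str = ""
    api_prefix: str = "/api/v1"
    debug: bool = True
    batch_size: int = 1
    max_wait: float = 0.1
    whisper_model: str = "large-v2"
    compute_type: str = "int8_float16"
    nemo_domain_type: str = "telephonic"  # general, meeting or telephonic
    nemo_storage_path: str = "nemo_storage"
    nemo_output_path: str = "nemo_outputs"

    class Config:
        env_file = ".env"  # reading from .env requires python-dotenv

settings = Settings()  # values in .env override the defaults above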
16 changes: 7 additions & 9 deletions Dockerfile
@@ -1,19 +1,17 @@
-FROM nvidia/cuda:11.7.0-devel-ubuntu22.04
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04

 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y \
     git \
     curl \
     ffmpeg \
     libsndfile1 \
-    software-properties-common
-RUN add-apt-repository ppa:deadsnakes/ppa \
-    && apt install -y python3.10 \
-    && rm -rf /var/lib/apt/lists/* \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+    software-properties-common \
+    python3-pip

 COPY requirements.txt /requirements.txt
-RUN python3.10 -m pip install -r requirements.txt
-RUN python3.10 -m pip install --upgrade torch==1.13.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install --upgrade torch==2.0.0+cu118 torchaudio==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu118

 COPY . /app
 WORKDIR /app
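A typical build-and-run sequence for the updated image; the image tag and published port are assumptions (check the repository README for the exact commands), and GPU access inside the container requires the NVIDIA Container Toolkit on the host:

docker build -t wordcab-transcribe:latest .
docker run -d --gpus all -p 5001:5001 --name wordcab-transcribe wordcab-transcribe:latest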
8 changes: 3 additions & 5 deletions README.md
@@ -1,6 +1,6 @@
 # Wordcab Transcribe 💬

-FastAPI based API for transcribing audio files using [`faster-whisper`](https://github.com/guillaumekln/faster-whisper) and [`pyannote-audio`](https://github.com/pyannote/pyannote-audio)
+FastAPI based API for transcribing audio files using [`faster-whisper`](https://github.com/guillaumekln/faster-whisper) and [`NVIDIA NeMo`](https://github.com/NVIDIA/NeMo)

 More details on this project on this [blog post](https://wordcab.github.io/wordcab-posts/blog/2023/03/31/wordcab-transcribe/).

@@ -54,9 +54,8 @@ import requests

headers = {"accept": "application/json"}
data = {
"num_speakers": 1, # optional, default is 0
"source_lang": "en", # optional, default is "en"
"timestamps": "seconds", # optional, default is "seconds"
"timestamps": "s", # optional, default is "s". Can be "s", "ms" or "hms".
}

filepath = "tests/sample_1.mp3" # or any other audio file. Prefer wav files.
@@ -80,9 +79,8 @@ import requests
headers = {"accept": "application/json", "Content-Type": "application/json"}
params = {"url": "https://youtu.be/JZ696sbfPHs"}
data = {
"num_speakers": 1, # optional, default is 0
"source_lang": "en", # optional, default is "en"
"timestamps": "seconds", # optional, default is "seconds"
"timestamps": "s", # optional, default is "s". Can be "s", "ms" or "hms".
}

response = requests.post(
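Both README snippets are shown only partially in the diff. For reference, a complete file-upload call assembled from the excerpts above; the host, port and route are assumptions based on API_PREFIX in .env, so check the repository README for the exact endpoint.

import requests

headers = {"accept": "application/json"}
data = {
    "source_lang": "en",  # optional, default is "en"
    "timestamps": "s",    # optional, default is "s". Can be "s", "ms" or "hms".
}
filepath = "tests/sample_1.mp3"  # or any other audio file. Prefer wav files.

with open(filepath, "rb") as f:
    response = requests.post(
        "http://localhost:5001/api/v1/audio",  # assumed host/port/route
        headers=headers,
        data=data,
        files={"file": f},
    )

print(response.json())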
91 changes: 91 additions & 0 deletions config/nemo/diar_infer_general.yaml
@@ -0,0 +1,91 @@
# This YAML file is created for all types of offline speaker diarization inference tasks in the `<NeMo git root>/example/speaker_tasks/diarization` folder.
# The inference parameters for the VAD, speaker embedding extractor, clustering module, MSDD module and ASR decoder are all included in this YAML file.
# All the keys under the `diarizer` key (`vad`, `speaker_embeddings`, `clustering`, `msdd_model`, `asr`) can be selectively used for their own purpose, and can be ignored if the module is not used.
# The configuration in this YAML file is optimized to show balanced performance across various domains. The VAD is optimized on multilingual ASR datasets and the diarizer is optimized on the DIHARD3 development set.
# An example line in an input manifest file (`.json` format):
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/file"}
name: &name "ClusterDiarizer"

num_workers: 1
sample_rate: 16000
batch_size: 64
device: null # can specify a specific device, i.e: cuda:1 (default cuda if cuda available, else cpu)
verbose: True # enable additional logging

diarizer:
  manifest_filepath: ???
  out_dir: ???
  oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps
  collar: 0.25 # Collar value for scoring
  ignore_overlap: True # Consider or ignore overlap segments while scoring

  vad:
    model_path: vad_multilingual_marblenet # .nemo local model path or pretrained VAD model name
    external_vad_manifest: null # This option is provided to use an external VAD and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set

    parameters: # Tuned by detection error rate (false alarm + miss) on multilingual ASR evaluation datasets
      window_length_in_sec: 0.63 # Window length in sec for VAD context input
      shift_length_in_sec: 0.08 # Shift length in sec for generating frame-level VAD predictions
      smoothing: False # False or type of smoothing method (eg: median)
      overlap: 0.5 # Overlap ratio for overlapped mean/median smoothing filter
      onset: 0.5 # Onset threshold for detecting the beginning of speech
      offset: 0.3 # Offset threshold for detecting the end of speech
      pad_onset: 0.2 # Adding durations before each speech segment
      pad_offset: 0.2 # Adding durations after each speech segment
      min_duration_on: 0.5 # Threshold for small non_speech deletion
      min_duration_off: 0.5 # Threshold for short speech segment deletion
      filter_speech_first: True

  speaker_embeddings:
    model_path: titanet_large # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet)
    parameters:
      window_length_in_sec: [1.9, 1.2, 0.5] # Window length(s) in sec (floating-point number). Either a number or a list. ex) 1.5 or [1.5,1.0,0.5]
      shift_length_in_sec: [0.95, 0.6, 0.25] # Shift length(s) in sec (floating-point number). Either a number or a list. ex) 0.75 or [0.75,0.5,0.25]
      multiscale_weights: [1, 1, 1] # Weight for each scale. Should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33]
      save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if the clustering result is used for other models, such as `msdd_model`.

  clustering:
    parameters:
      oracle_num_speakers: False # If True, use the number of speakers provided in the manifest file.
      max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored.
      enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
      max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
      sparse_search_volume: 10 # The higher the number, the more values will be examined with more time.
      maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.

  msdd_model:
    model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
    parameters:
      use_speaker_model_from_ckpt: True # If True, use the speaker embedding model in the checkpoint. If False, the speaker embedding model provided in the config will be used.
      infer_batch_size: 25 # Batch size for MSDD inference.
      sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller, the more generous on detecting overlaps.
      seq_eval_mode: False # If True, use the oracle number of speakers and evaluate the F1 score for the given speaker sequences. Default is False.
      split_infer: True # If True, break the input audio clip into short sequences and calculate cluster average embeddings for inference.
      diar_window_length: 50 # The length of split short sequences when split_infer is True.
      overlap_infer_spk_limit: 5 # If the estimated number of speakers is larger than this number, overlap speech is not estimated.

  asr:
    model_path: null # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes.
    parameters:
      asr_based_vad: False # If True, speech segmentation for diarization is based on word-timestamps from ASR inference.
      asr_based_vad_threshold: 1.0 # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR-based VAD.
      asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null.
      decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model.
      word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2].
      word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'.
      fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature.
      colored_text: False # If True, use colored text to distinguish speakers in the output transcript.
      print_time: True # If True, the start and end time of each speaker turn is printed in the output transcript.
      break_lines: False # If True, the output transcript breaks lines to fix the line width (default is 90 chars)

    ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode)
      pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file.
      beam_width: 32
      alpha: 0.5
      beta: 2.5

    realigning_lm_parameters: # Experimental feature
      arpa_language_model: null # Provide a KenLM language model in .arpa format.
      min_number_of_words: 3 # Min number of words for the left context.
      max_number_of_words: 10 # Max number of words for the right context.
      logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses.
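To show how a config like this is consumed, here is a minimal offline-diarization sketch using NeMo's ClusteringDiarizer. The manifest and output paths are placeholders; the service's actual wiring (driven by NEMO_STORAGE_PATH and NEMO_OUTPUT_PATH in .env) may differ.

import json

from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer

# Write a one-line manifest as described in the comment at the top of the YAML.
manifest_entry = {
    "audio_filepath": "tests/sample_1.wav",  # placeholder path
    "offset": 0, "duration": None, "label": "infer", "text": "-",
    "num_speakers": None, "rttm_filepath": None, "uem_filepath": None,
}
with open("manifest.json", "w") as f:
    f.write(json.dumps(manifest_entry) + "\n")

cfg = OmegaConf.load("config/nemo/diar_infer_general.yaml")
cfg.diarizer.manifest_filepath = "manifest.json"  # fills the ??? above
cfg.diarizer.out_dir = "nemo_outputs"             # fills the ??? above

ClusteringDiarizer(cfg=cfg).diarize()  # writes RTTM files under out_dir/pred_rttms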
91 changes: 91 additions & 0 deletions config/nemo/diar_infer_meeting.yaml
@@ -0,0 +1,91 @@
# This YAML file is created for all types of offline speaker diarization inference tasks in the `<NeMo git root>/example/speaker_tasks/diarization` folder.
# The inference parameters for the VAD, speaker embedding extractor, clustering module, MSDD module and ASR decoder are all included in this YAML file.
# All the keys under the `diarizer` key (`vad`, `speaker_embeddings`, `clustering`, `msdd_model`, `asr`) can be selectively used for their own purpose, and can be ignored if the module is not used.
# The configuration in this YAML file is suitable for 3-5 speakers participating in a meeting and may not show the best performance on other types of dialogue.
# An example line in an input manifest file (`.json` format):
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/file"}
name: &name "ClusterDiarizer"

num_workers: 1
sample_rate: 16000
batch_size: 64
device: null # can specify a specific device, i.e: cuda:1 (default cuda if cuda available, else cpu)
verbose: True # enable additional logging

diarizer:
  manifest_filepath: ???
  out_dir: ???
  oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps
  collar: 0.25 # Collar value for scoring
  ignore_overlap: True # Consider or ignore overlap segments while scoring

  vad:
    model_path: vad_multilingual_marblenet # .nemo local model path or pretrained VAD model name
    external_vad_manifest: null # This option is provided to use an external VAD and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set

    parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set)
      window_length_in_sec: 0.63 # Window length in sec for VAD context input
      shift_length_in_sec: 0.01 # Shift length in sec for generating frame-level VAD predictions
      smoothing: False # False or type of smoothing method (eg: median)
      overlap: 0.5 # Overlap ratio for overlapped mean/median smoothing filter
      onset: 0.9 # Onset threshold for detecting the beginning of speech
      offset: 0.5 # Offset threshold for detecting the end of speech
      pad_onset: 0 # Adding durations before each speech segment
      pad_offset: 0 # Adding durations after each speech segment
      min_duration_on: 0 # Threshold for small non_speech deletion
      min_duration_off: 0.6 # Threshold for short speech segment deletion
      filter_speech_first: True

  speaker_embeddings:
    model_path: titanet_large # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet)
    parameters:
      window_length_in_sec: [3.0, 2.5, 2.0, 1.5, 1.0, 0.5] # Window length(s) in sec (floating-point number). Either a number or a list. ex) 1.5 or [1.5,1.0,0.5]
      shift_length_in_sec: [1.5, 1.25, 1.0, 0.75, 0.5, 0.25] # Shift length(s) in sec (floating-point number). Either a number or a list. ex) 0.75 or [0.75,0.5,0.25]
      multiscale_weights: [1, 1, 1, 1, 1, 1] # Weight for each scale. Should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33]
      save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if the clustering result is used for other models, such as `msdd_model`.

  clustering:
    parameters:
      oracle_num_speakers: False # If True, use the number of speakers provided in the manifest file.
      max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored.
      enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
      max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
      sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
      maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers.

  msdd_model:
    model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
    parameters:
      use_speaker_model_from_ckpt: True # If True, use the speaker embedding model in the checkpoint. If False, the speaker embedding model provided in the config will be used.
      infer_batch_size: 25 # Batch size for MSDD inference.
      sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller, the more generous on detecting overlaps.
      seq_eval_mode: False # If True, use the oracle number of speakers and evaluate the F1 score for the given speaker sequences. Default is False.
      split_infer: True # If True, break the input audio clip into short sequences and calculate cluster average embeddings for inference.
      diar_window_length: 50 # The length of split short sequences when split_infer is True.
      overlap_infer_spk_limit: 5 # If the estimated number of speakers is larger than this number, overlap speech is not estimated.

  asr:
    model_path: stt_en_conformer_ctc_large # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes.
    parameters:
      asr_based_vad: False # If True, speech segmentation for diarization is based on word-timestamps from ASR inference.
      asr_based_vad_threshold: 1.0 # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR-based VAD.
      asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null.
      decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model.
      word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2].
      word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'.
      fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature.
      colored_text: False # If True, use colored text to distinguish speakers in the output transcript.
      print_time: True # If True, the start and end time of each speaker turn is printed in the output transcript.
      break_lines: False # If True, the output transcript breaks lines to fix the line width (default is 90 chars)

    ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode)
      pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file.
      beam_width: 32
      alpha: 0.5
      beta: 2.5

    realigning_lm_parameters: # Experimental feature
      arpa_language_model: null # Provide a KenLM language model in .arpa format.
      min_number_of_words: 3 # Min number of words for the left context.
      max_number_of_words: 10 # Max number of words for the right context.
      logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses.
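The two configs in this commit differ mainly in the VAD thresholds, embedding scales and ASR model; NEMO_DOMAIN_TYPE in .env selects between them. A hypothetical helper illustrating that selection (the telephonic variant is referenced in .env and assumed to ship alongside these two files; the repo's actual loading code may differ):

from pathlib import Path

from omegaconf import OmegaConf

def load_diarization_config(domain_type: str = "telephonic"):
    """Return the NeMo diarization config matching the audio domain."""
    valid = {"general", "meeting", "telephonic"}
    if domain_type not in valid:
        raise ValueError(f"NEMO_DOMAIN_TYPE must be one of {valid}, got {domain_type!r}")
    # Assumes the naming convention config/nemo/diar_infer_<domain>.yaml seen above.
    return OmegaConf.load(Path("config/nemo") / f"diar_infer_{domain_type}.yaml")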