Split NeMo diarization process into sub-modules (#180)

* nemo service split init
* first segmentation split
* update extract_embeddings
* move speaker model to EncDecSpeakerLabelModel
* simplify audio dataset
* finish segmentation part
* start mapping between embds and labels
* add msdd diarizer and start process
* fix last inference + start preds formatting
* first diarization implement attempt
* correct some nits
* fix the post-processing pipeline for new diarization outputs
* remove old diarization stuff
* remove old diarization nemo stuff
* remove nemo config unused stuff
* update dockerfile
* update the config with the new diarization scales
* fix linter
* fix multiscale_weights tensor
* fix transcribe service
* fix tests
* fix config extra_languages
* fix diarization config tests
Wordcab · Aug 1, 2023 · 7f398a2 · 7f398a2
1 parent fba726f
commit 7f398a2
Show file tree

Hide file tree

Showing 15 changed files with 2,351 additions and 568 deletions.
diff --git a/.env b/.env
@@ -16,16 +16,6 @@ API_PREFIX="/api/v1"
 # Debug mode for FastAPI. It allows for hot reloading when code changes in development.
 DEBUG=True
 #
-# ----------------------------------------------- BATCH CONFIGURATION ------------------------------------------------ #
-#
-# The batch_size parameter is used to control the number of audio files that are processed in parallel.
-# If your server GPU has a lot of memory, you can increase this value to improve performance.
-# For simplicity, we recommend leaving this value at 1, unless you are sure that your GPU has enough memory (> 40GB)
-BATCH_SIZE=1
-# The max_wait parameter is used to control the maximum amount of time (in seconds) that the server will wait for
-# processing the tasks in the queue, if not empty. It's useful only when the batch_size is greater than 1.
-MAX_WAIT=0.1
-#
 # ----------------------------------------------- MODELS CONFIGURATION ----------------------------------------------- #
 #
 # ----------------------------------------------------- WHISPER ------------------------------------------------------ #
@@ -47,20 +37,20 @@ WHISPER_MODEL="large-v2"
 COMPUTE_TYPE="float16"
 # The extra_languages parameter is used to control the languages that need an extra model to be loaded.
 # You can specify multiple languages separated by a comma. The available languages are: `he` (Hebrew).
-EXTRA_LANGUAGES=""
-#
-# --------------------------------------------------- NVIDIA NEMO ---------------------------------------------------- #
-#
-# The nemo_domain_type defines the configuration file used by the model for diarization. The available options are:
-# `general`, `meeting` and `telephonic`. The default value is `telephonic`. If you choose another type, you will need
-# to provide a custom model.
-NEMO_DOMAIN_TYPE="telephonic"
-# The nemo_storage_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will
-# store the diarization models.
-NEMO_STORAGE_PATH="nemo_storage"
-# The nemo_output_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will
-# store the diarization outputs.
-NEMO_OUTPUT_PATH="nemo_outputs"
+EXTRA_LANGUAGES=
+#
+# --------------------------------------------------- DIARIZATION ---------------------------------------------------- #
+#
+# In an MSDD (Multiscale Diarization Decoder) model, the diarization model is trained on multiple window lengths.
+# The window_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
+# be "1.5, 1.25, 1.0, 0.75, 0.5".
+WINDOW_LENGTHS="1.5,1.25,1.0,0.75,0.5"
+# The shift_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
+# be "0.75, 0.625, 0.5, 0.375, 0.25".
+SHIFT_LENGTHS="0.75,0.625,0.5,0.375,0.25"
+# The multiscale_weights are float values separated by a comma. If not specified, the default value will be
+# "1.0, 1.0, 1.0, 1.0, 1.0".
+MULTISCALE_WEIGHTS="1.0,1.0,1.0,1.0,1.0"
 #
 # ---------------------------------------------- ASR TYPE CONFIGURATION ---------------------------------------------- #
 #

diff --git a/.gitignore b/.gitignore
@@ -10,9 +10,10 @@ __pycache__/
 /data/
 /dist/
 /docs/_build/
+/infer_out_dir/
 /src/*.egg-info/
+async_bench.py
 test.ipynb
 test.py
-async_bench.py
 whisper_model
 whisper_model_he
diff --git a/Dockerfile b/Dockerfile
@@ -31,7 +31,6 @@ COPY ./poetry.lock ./pyproject.toml ./
 RUN poetry install --only main
 
 COPY ./wordcab_transcribe /app/wordcab_transcribe
-COPY ./config /app/config
 COPY ./.env /app/.env
 
 WORKDIR /app

diff --git a/config/nemo/diar_infer_general.yaml b/config/nemo/diar_infer_general.yaml
diff --git a/config/nemo/diar_infer_meeting.yaml b/config/nemo/diar_infer_meeting.yaml