Updated TensorRT-LLM version to latest, enabled dual-channel for tens… #309

Merged: 3 commits, May 26, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ __pycache__/
.coverage.*
.DS_Store
.env_dev
.env_*
.nox/
.pytest_cache/
.python-version
32 changes: 20 additions & 12 deletions Dockerfile
@@ -1,12 +1,12 @@
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

ENV NVIDIA_DRIVER_CAPABILITIES ${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}

ENV PYTHONUNBUFFERED=1

ENV DEBIAN_FRONTEND=noninteractive
ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN apt update && apt install -y \
RUN apt-get update && apt-get install -y --no-install-recommends \
libsndfile1 \
software-properties-common \
ffmpeg \
@@ -28,9 +28,6 @@ RUN apt update && apt install -y \
python3-dev \
liblzma-dev \
libsqlite3-dev \
&& rm -rf /var/lib/apt/lists/*

RUN apt update && apt install -y \
libtiff-tools=4.3.0-6ubuntu0.8 \
libtiff5=4.3.0-6ubuntu0.8 \
libgnutls30=3.7.3-4ubuntu1.5 \
@@ -42,7 +39,8 @@ RUN apt update && apt install -y \
login=1:4.8.1-2ubuntu2.2 \
passwd=1:4.8.1-2ubuntu2.2 \
uidmap=1:4.8.1-2ubuntu2.2 \
binutils=2.38-4ubuntu2.6
binutils=2.38-4ubuntu2.6 \
&& rm -rf /var/lib/apt/lists/*

RUN cd /tmp && \
wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
@@ -57,20 +55,30 @@ RUN cd /tmp && \
RUN export CUDNN_PATH=$(python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))') && \
echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'${CUDNN_PATH} >> ~/.bashrc

ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \
&& sed -i 's/>= 40\\.9\\.0/>= 40.9.0, < 69/g' /tmp/mpi4py-${MPI4PY_VERSION}/pyproject.toml \
&& pip install /tmp/mpi4py-${MPI4PY_VERSION} \
&& rm -rf /tmp/mpi4py*

RUN python -m pip install pip --upgrade

COPY pre_requirements.txt .
COPY requirements.txt .

RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt

WORKDIR /app

COPY . .
RUN git clone https://github.com/NVIDIA/NeMo.git ./nemo_local && \
cd ./nemo_local && \
git config --global user.email "[email protected]" && \
git config --global user.name "Your Name" && \
git fetch origin pull/9114/head:pr9114 && \
git merge pr9114 && \
pip install -e ".[asr]"

ENV PYTHONPATH="/app/src"

RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com .[runtime]
COPY . .

CMD ["uvicorn", "--host=0.0.0.0", "--port=5001", "src.wordcab_transcribe.main:app"]
13 changes: 13 additions & 0 deletions pre_requirements.txt
@@ -0,0 +1,13 @@
argon2-cffi==23.1.0
fastapi==0.110.0
python-jose[cryptography]==3.3.0
python-multipart==0.0.9
shortuuid==1.0.13
svix==1.21.0
uvicorn==0.29.0
websockets==12.0
tensorrt_llm==0.11.0.dev2024052100
Cython==3.0.10
youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies
deepmultilingualpunctuation==1.0.1
pyannote.audio==3.2.0
27 changes: 27 additions & 0 deletions requirements.txt
@@ -0,0 +1,27 @@
aiohttp==3.9.3
aiofiles==23.2.1
boto3
faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
ffmpeg-python==0.2.0
transformers==4.38.2
librosa==0.10.1
loguru==0.7.2
nltk==3.8.1
numpy==1.26.4
onnxruntime==1.17.1
pandas==2.2.1
pydantic==2.6.4
python-dotenv==1.0.1
tensorshare==0.1.1
torch==2.2.2
torchaudio==2.2.2
wget==3.2.0
yt-dlp==2024.3.10
tiktoken==0.6.0
datasets==2.18.0
kaldialign==0.9.0
openai-whisper==v20231117
soundfile==0.12.1
safetensors==0.4.2
janus==1.0.0
backports.lzma==0.0.14
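
The dependency pins are split across two files so the Dockerfile can resolve both in a single pass against the NVIDIA extra index (pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt, as shown in the Dockerfile above). A hedged post-build sanity check; the version attributes are standard, and the expected values only restate the pins rather than verified output:

# Hedged sanity-check sketch: run inside the built image to confirm the
# pinned stack resolved as declared above.
import tensorrt_llm
import torch
import transformers

print("tensorrt_llm:", tensorrt_llm.__version__)  # pre_requirements.txt pins 0.11.0.dev2024052100
print("torch:", torch.__version__)                 # requirements.txt pins 2.2.2
print("transformers:", transformers.__version__)   # requirements.txt pins 4.38.2
assert torch.cuda.is_available(), "expects the CUDA runtime from the nvidia/cuda base image"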
73 changes: 45 additions & 28 deletions src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py
@@ -22,10 +22,10 @@
from tensorrt_llm.builder import Builder
from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
from tensorrt_llm.logger import logger
from tensorrt_llm.models import quantize_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
from weight import load_decoder_weight, load_encoder_weight

MODEL_ENCODER_NAME = "whisper_encoder"
@@ -316,32 +316,49 @@ def build_decoder(model, args):
)

tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
hidden_size=model_metadata["n_text_state"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
vocab_size=model_metadata["n_vocab"],
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
max_position_embeddings=model_metadata["n_text_ctx"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
hidden_act="gelu",
rescale_before_lm_head=False,
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
)
)

if args.use_weight_only:
@@ -377,7 +394,7 @@ def build_decoder(model, args):
model_metadata["n_audio_ctx"],
)

tensorrt_llm_whisper_decoder(*inputs)
tensorrt_llm_whisper_decoder(**inputs)

if args.debug_mode:
for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():
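
A note on the import and call-site changes above: quantize_model moved out of tensorrt_llm.models into tensorrt_llm.quantization.quantize_by_modelopt in the newer TensorRT-LLM release pinned by this PR, and the decoder is now invoked with **inputs rather than *inputs, presumably because the newer forward signature expects named tensors instead of a positional tuple. If a build script has to run against both the old and the new pin, a small compatibility shim is possible; this hedged sketch assumes only the two import paths visible in this diff:

# Hedged compatibility sketch: which path exists depends on the installed
# TensorRT-LLM release; both locations appear in this diff.
try:
    # Release pinned in pre_requirements.txt (0.11.0.dev2024052100)
    from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
except ImportError:
    # Older releases exposed it from tensorrt_llm.models
    from tensorrt_llm.models import quantize_model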
2 changes: 2 additions & 0 deletions src/wordcab_transcribe/services/asr_service.py
@@ -336,10 +336,12 @@ def create_transcription_local_service(self) -> None:
def create_diarization_local_service(self) -> None:
"""Create a local diarization service."""
if settings.diarization_backend == "longform-diarizer":
logger.info("Using LongFormDiarizeService for diarization.")
self.local_services.diarization = LongFormDiarizeService(
device=self.device,
)
else:
logger.info("Using DiarizeService for diarization.")
self.local_services.diarization = DiarizeService(
device=self.device,
device_index=self.device_index,
1 change: 1 addition & 0 deletions src/wordcab_transcribe/services/post_processing_service.py
@@ -362,6 +362,7 @@ def reconstruct_multi_channel_utterances(
sentences = []
for speaker, word in transcript_words:
start_t, end_t, text = word.start, word.end, word.word
print(speaker, previous_speaker, text)

if speaker != previous_speaker:
sentences.append(current_sentence)
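
For context on the loop being traced here: multi-channel transcripts arrive as a time-ordered stream of (speaker, word) pairs, and reconstruct_multi_channel_utterances regroups them into per-speaker sentences by closing the current sentence whenever the speaker changes; the added print reads like a temporary trace of those transitions. A simplified, hedged sketch of the regrouping idea (the real function also carries start/end timestamps and other sentence state not shown):

# Illustrative only: minimal speaker-change regrouping over (speaker, word) pairs.
def regroup_by_speaker(transcript_words):
    sentences, current, previous_speaker = [], [], None
    for speaker, text in transcript_words:
        if previous_speaker is not None and speaker != previous_speaker:
            sentences.append((previous_speaker, " ".join(current)))
            current = []
        current.append(text)
        previous_speaker = speaker
    if current:
        sentences.append((previous_speaker, " ".join(current)))
    return sentences

# regroup_by_speaker([(0, "hello"), (0, "there"), (1, "hi")])
# -> [(0, "hello there"), (1, "hi")]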