diff --git a/.gitignore b/.gitignore
index 456b9e2..faf8fd1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ __pycache__/
 .coverage.*
 .DS_Store
 .env_dev
+.env_*
 .nox/
 .pytest_cache/
 .python-version
diff --git a/Dockerfile b/Dockerfile
index 348c278..e305757 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,12 @@
 FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime
 
 ENV NVIDIA_DRIVER_CAPABILITIES ${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}
-
 ENV PYTHONUNBUFFERED=1
-
 ENV DEBIAN_FRONTEND=noninteractive
+ENV MPI4PY_VERSION="3.1.5"
+ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"
 
-RUN apt update && apt install -y \
+RUN apt-get update && apt-get install -y --no-install-recommends \
     libsndfile1 \
     software-properties-common \
     ffmpeg \
@@ -28,9 +28,6 @@ RUN apt update && apt install -y \
     python3-dev \
     liblzma-dev \
     libsqlite3-dev \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN apt update && apt install -y \
     libtiff-tools=4.3.0-6ubuntu0.8 \
     libtiff5=4.3.0-6ubuntu0.8 \
     libgnutls30=3.7.3-4ubuntu1.5 \
@@ -42,7 +39,8 @@ RUN apt update && apt install -y \
     login=1:4.8.1-2ubuntu2.2 \
     passwd=1:4.8.1-2ubuntu2.2 \
     uidmap=1:4.8.1-2ubuntu2.2 \
-    binutils=2.38-4ubuntu2.6
+    binutils=2.38-4ubuntu2.6 \
+    && rm -rf /var/lib/apt/lists/*
 
 RUN cd /tmp && \
     wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
@@ -57,9 +55,6 @@ RUN cd /tmp && \
 RUN export CUDNN_PATH=$(python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))') && \
     echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'${CUDNN_PATH} >> ~/.bashrc
 
-ENV MPI4PY_VERSION="3.1.5"
-ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"
-
 RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \
     && sed -i 's/>= 40\\.9\\.0/>= 40.9.0, < 69/g' /tmp/mpi4py-${MPI4PY_VERSION}/pyproject.toml \
     && pip install /tmp/mpi4py-${MPI4PY_VERSION} \
@@ -67,10 +62,23 @@ RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \
 
 RUN python -m pip install pip --upgrade
 
+COPY pre_requirements.txt .
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt
+
 WORKDIR /app
 
-COPY . .
+RUN git clone https://github.com/NVIDIA/NeMo.git ./nemo_local && \
+    cd ./nemo_local && \
+    git config --global user.email "you@example.com" && \
+    git config --global user.name "Your Name" && \
+    git fetch origin pull/9114/head:pr9114 && \
+    git merge pr9114 && \
+    pip install -e ".[asr]"
+
+ENV PYTHONPATH="/app/src"
 
-RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com .[runtime]
+COPY . .
CMD ["uvicorn", "--host=0.0.0.0", "--port=5001", "src.wordcab_transcribe.main:app"] diff --git a/pre_requirements.txt b/pre_requirements.txt new file mode 100644 index 0000000..ba8fef7 --- /dev/null +++ b/pre_requirements.txt @@ -0,0 +1,13 @@ +argon2-cffi==23.1.0 +fastapi==0.110.0 +python-jose[cryptography]==3.3.0 +python-multipart==0.0.9 +shortuuid==1.0.13 +svix==1.21.0 +uvicorn==0.29.0 +websockets==12.0 +tensorrt_llm==0.11.0.dev2024052100 +Cython==3.0.10 +youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies +deepmultilingualpunctuation==1.0.1 +pyannote.audio==3.2.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2d4077e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +aiohttp==3.9.3 +aiofiles==23.2.1 +boto3 +faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz +ffmpeg-python==0.2.0 +transformers==4.38.2 +librosa==0.10.1 +loguru==0.7.2 +nltk==3.8.1 +numpy==1.26.4 +onnxruntime==1.17.1 +pandas==2.2.1 +pydantic==2.6.4 +python-dotenv==1.0.1 +tensorshare==0.1.1 +torch==2.2.2 +torchaudio==2.2.2 +wget==3.2.0 +yt-dlp==2024.3.10 +tiktoken==0.6.0 +datasets==2.18.0 +kaldialign==0.9.0 +openai-whisper==v20231117 +soundfile==0.12.1 +safetensors==0.4.2 +janus==1.0.0 +backports.lzma==0.0.14 \ No newline at end of file diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py index 1106a2d..f083fbf 100644 --- a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py +++ b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py @@ -22,10 +22,10 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.functional import LayerNormPositionType, LayerNormType from tensorrt_llm.logger import logger -from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model from weight import load_decoder_weight, load_encoder_weight MODEL_ENCODER_NAME = "whisper_encoder" @@ -316,32 +316,49 @@ def build_decoder(model, args): ) tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel( - num_layers=model_metadata["n_text_layer"], - num_heads=model_metadata["n_text_head"], - hidden_size=model_metadata["n_text_state"], - ffn_hidden_size=4 * model_metadata["n_text_state"], - encoder_hidden_size=model_metadata["n_text_state"], - encoder_num_heads=model_metadata["n_text_head"], - vocab_size=model_metadata["n_vocab"], - head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"], - max_position_embeddings=model_metadata["n_text_ctx"], - has_position_embedding=True, - relative_attention=False, - max_distance=0, - num_buckets=0, - has_embedding_layernorm=False, - has_embedding_scale=False, - q_scaling=1.0, - has_attention_qkvo_bias=True, - has_mlp_bias=True, - has_model_final_layernorm=True, - layernorm_eps=1e-5, - layernorm_position=LayerNormPositionType.pre_layernorm, - layernorm_type=LayerNormType.LayerNorm, - hidden_act="gelu", - rescale_before_lm_head=False, - dtype=str_dtype_to_trt(args.dtype), - logits_dtype=str_dtype_to_trt(args.dtype), + tensorrt_llm.models.modeling_utils.PretrainedConfig( + architecture="whisper", + dtype=str_dtype_to_trt(args.dtype), + logits_dtype=str_dtype_to_trt(args.dtype), + vocab_size=model_metadata["n_vocab"], + 
+            max_position_embeddings=model_metadata["n_text_ctx"],
+            hidden_size=model_metadata["n_text_state"],
+            num_hidden_layers=model_metadata["n_text_layer"],
+            num_attention_heads=model_metadata["n_text_head"],
+            num_key_value_heads=model_metadata["n_text_head"],
+            hidden_act="gelu",
+            intermediate_size=4 * model_metadata["n_text_state"],
+            norm_epsilon=1e-5,
+            position_embedding_type="learned_absolute",
+            world_size=1,
+            tp_size=1,
+            pp_size=1,
+            gpus_per_node=1,
+            quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
+            head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
+            num_layers=model_metadata["n_text_layer"],
+            num_heads=model_metadata["n_text_head"],
+            ffn_hidden_size=4 * model_metadata["n_text_state"],
+            encoder_hidden_size=model_metadata["n_text_state"],
+            encoder_num_heads=model_metadata["n_text_head"],
+            has_position_embedding=True,
+            relative_attention=False,
+            max_distance=0,
+            num_buckets=0,
+            has_embedding_layernorm=False,
+            has_embedding_scale=False,
+            q_scaling=1.0,
+            has_attention_qkvo_bias=True,
+            has_mlp_bias=True,
+            has_model_final_layernorm=True,
+            layernorm_eps=1e-5,
+            layernorm_position=LayerNormPositionType.pre_layernorm,
+            layernorm_type=LayerNormType.LayerNorm,
+            rescale_before_lm_head=False,
+            encoder_head_size=model_metadata["n_text_state"]
+            // model_metadata["n_text_head"],  # Added missing variable
+            skip_cross_qkv=False,
+        )
     )
 
     if args.use_weight_only:
@@ -377,7 +394,7 @@ def build_decoder(model, args):
         model_metadata["n_audio_ctx"],
     )
 
-    tensorrt_llm_whisper_decoder(*inputs)
+    tensorrt_llm_whisper_decoder(**inputs)
 
     if args.debug_mode:
         for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():
diff --git a/src/wordcab_transcribe/services/asr_service.py b/src/wordcab_transcribe/services/asr_service.py
index 19bf25e..ca078ed 100644
--- a/src/wordcab_transcribe/services/asr_service.py
+++ b/src/wordcab_transcribe/services/asr_service.py
@@ -336,10 +336,12 @@ def create_transcription_local_service(self) -> None:
     def create_diarization_local_service(self) -> None:
         """Create a local diarization service."""
         if settings.diarization_backend == "longform-diarizer":
+            logger.info("Using LongFormDiarizeService for diarization.")
             self.local_services.diarization = LongFormDiarizeService(
                 device=self.device,
             )
         else:
+            logger.info("Using DiarizeService for diarization.")
             self.local_services.diarization = DiarizeService(
                 device=self.device,
                 device_index=self.device_index,
diff --git a/src/wordcab_transcribe/services/post_processing_service.py b/src/wordcab_transcribe/services/post_processing_service.py
index 3bc82d8..ffa7660 100644
--- a/src/wordcab_transcribe/services/post_processing_service.py
+++ b/src/wordcab_transcribe/services/post_processing_service.py
@@ -362,6 +362,7 @@ def reconstruct_multi_channel_utterances(
     sentences = []
     for speaker, word in transcript_words:
         start_t, end_t, text = word.start, word.end, word.word
+        print(speaker, previous_speaker, text)
 
         if speaker != previous_speaker:
             sentences.append(current_sentence)
diff --git a/src/wordcab_transcribe/services/transcribe_service.py b/src/wordcab_transcribe/services/transcribe_service.py
index 8fd20a6..8f2fb9e 100644
--- a/src/wordcab_transcribe/services/transcribe_service.py
+++ b/src/wordcab_transcribe/services/transcribe_service.py
@@ -251,7 +251,6 @@ def __call__(
             _outputs = [segment._asdict() for segment in segments]
             outputs = TranscriptionOutput(segments=_outputs)
 
-
         else:
             outputs = self.multi_channel(
                 audio,
@@ -319,6 +318,7 @@ def multi_channel(
         self,
         audio_list: List[Union[str, torch.Tensor, TensorShare]],
         source_lang: str,
+        speaker_id: int,
         suppress_blank: bool = False,
         word_timestamps: bool = True,
         internal_vad: bool = True,
@@ -335,6 +335,7 @@ def multi_channel(
         Args:
             audio_list (List[Union[str, torch.Tensor, TensorShare]]): List of audio file paths or audio tensors.
             source_lang (str): Language of the audio file.
+            speaker_id (int): Speaker ID used in the diarization.
             suppress_blank (bool):
                 Whether to suppress blank at the beginning of the sampling.
             word_timestamps (bool):
@@ -436,6 +437,7 @@ def multi_channel(
                     word["start"] = round(word["start"], 2)
                     word["end"] = round(word["end"], 2)
                 segment["text"] = segment["text"].strip()
+                segment["start"] = round(segment.pop("start_time"), 2)
                 segment["end"] = round(segment.pop("end_time"), 2)
 
                 extra = {