diff --git a/.gitignore b/.gitignore index 456b9e2..faf8fd1 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ .coverage.* .DS_Store .env_dev +.env_* .nox/ .pytest_cache/ .python-version diff --git a/Dockerfile b/Dockerfile index 348c278..e305757 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,12 @@ FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime ENV NVIDIA_DRIVER_CAPABILITIES ${NVIDIA_DRIVER_CAPABILITIES:-compute,utility} - ENV PYTHONUNBUFFERED=1 - ENV DEBIAN_FRONTEND=noninteractive +ENV MPI4PY_VERSION="3.1.5" +ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz" -RUN apt update && apt install -y \ +RUN apt-get update && apt-get install -y --no-install-recommends \ libsndfile1 \ software-properties-common \ ffmpeg \ @@ -28,9 +28,6 @@ RUN apt update && apt install -y \ python3-dev \ liblzma-dev \ libsqlite3-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN apt update && apt install -y \ libtiff-tools=4.3.0-6ubuntu0.8 \ libtiff5=4.3.0-6ubuntu0.8 \ libgnutls30=3.7.3-4ubuntu1.5 \ @@ -42,7 +39,8 @@ RUN apt update && apt install -y \ login=1:4.8.1-2ubuntu2.2 \ passwd=1:4.8.1-2ubuntu2.2 \ uidmap=1:4.8.1-2ubuntu2.2 \ - binutils=2.38-4ubuntu2.6 + binutils=2.38-4ubuntu2.6 \ + && rm -rf /var/lib/apt/lists/* RUN cd /tmp && \ wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \ @@ -57,9 +55,6 @@ RUN cd /tmp && \ RUN export CUDNN_PATH=$(python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))') && \ echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'${CUDNN_PATH} >> ~/.bashrc -ENV MPI4PY_VERSION="3.1.5" -ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz" - RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \ && sed -i 's/>= 40\\.9\\.0/>= 40.9.0, < 69/g' /tmp/mpi4py-${MPI4PY_VERSION}/pyproject.toml \ && pip install /tmp/mpi4py-${MPI4PY_VERSION} \ @@ -67,10 +62,23 @@ RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \ RUN python -m pip install pip --upgrade +COPY pre_requirements.txt . +COPY requirements.txt . + +RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt + WORKDIR /app -COPY . . +RUN git clone https://github.com/NVIDIA/NeMo.git ./nemo_local && \ + cd ./nemo_local && \ + git config --global user.email "you@example.com" && \ + git config --global user.name "Your Name" && \ + git fetch origin pull/9114/head:pr9114 && \ + git merge pr9114 && \ + pip install -e ".[asr]" + +ENV PYTHONPATH="/app/src" -RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com .[runtime] +COPY . . CMD ["uvicorn", "--host=0.0.0.0", "--port=5001", "src.wordcab_transcribe.main:app"] diff --git a/pre_requirements.txt b/pre_requirements.txt new file mode 100644 index 0000000..ba8fef7 --- /dev/null +++ b/pre_requirements.txt @@ -0,0 +1,13 @@ +argon2-cffi==23.1.0 +fastapi==0.110.0 +python-jose[cryptography]==3.3.0 +python-multipart==0.0.9 +shortuuid==1.0.13 +svix==1.21.0 +uvicorn==0.29.0 +websockets==12.0 +tensorrt_llm==0.11.0.dev2024052100 +Cython==3.0.10 +youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies +deepmultilingualpunctuation==1.0.1 +pyannote.audio==3.2.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2d4077e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +aiohttp==3.9.3 +aiofiles==23.2.1 +boto3 +faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz +ffmpeg-python==0.2.0 +transformers==4.38.2 +librosa==0.10.1 +loguru==0.7.2 +nltk==3.8.1 +numpy==1.26.4 +onnxruntime==1.17.1 +pandas==2.2.1 +pydantic==2.6.4 +python-dotenv==1.0.1 +tensorshare==0.1.1 +torch==2.2.2 +torchaudio==2.2.2 +wget==3.2.0 +yt-dlp==2024.3.10 +tiktoken==0.6.0 +datasets==2.18.0 +kaldialign==0.9.0 +openai-whisper==v20231117 +soundfile==0.12.1 +safetensors==0.4.2 +janus==1.0.0 +backports.lzma==0.0.14 \ No newline at end of file diff --git a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py index 1106a2d..f083fbf 100644 --- a/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py +++ b/src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py @@ -22,10 +22,10 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.functional import LayerNormPositionType, LayerNormType from tensorrt_llm.logger import logger -from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model from weight import load_decoder_weight, load_encoder_weight MODEL_ENCODER_NAME = "whisper_encoder" @@ -316,32 +316,49 @@ def build_decoder(model, args): ) tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel( - num_layers=model_metadata["n_text_layer"], - num_heads=model_metadata["n_text_head"], - hidden_size=model_metadata["n_text_state"], - ffn_hidden_size=4 * model_metadata["n_text_state"], - encoder_hidden_size=model_metadata["n_text_state"], - encoder_num_heads=model_metadata["n_text_head"], - vocab_size=model_metadata["n_vocab"], - head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"], - max_position_embeddings=model_metadata["n_text_ctx"], - has_position_embedding=True, - relative_attention=False, - max_distance=0, - num_buckets=0, - has_embedding_layernorm=False, - has_embedding_scale=False, - q_scaling=1.0, - has_attention_qkvo_bias=True, - has_mlp_bias=True, - has_model_final_layernorm=True, - layernorm_eps=1e-5, - layernorm_position=LayerNormPositionType.pre_layernorm, - layernorm_type=LayerNormType.LayerNorm, - hidden_act="gelu", - rescale_before_lm_head=False, - dtype=str_dtype_to_trt(args.dtype), - logits_dtype=str_dtype_to_trt(args.dtype), + tensorrt_llm.models.modeling_utils.PretrainedConfig( + architecture="whisper", + dtype=str_dtype_to_trt(args.dtype), + logits_dtype=str_dtype_to_trt(args.dtype), + vocab_size=model_metadata["n_vocab"], + max_position_embeddings=model_metadata["n_text_ctx"], + hidden_size=model_metadata["n_text_state"], + num_hidden_layers=model_metadata["n_text_layer"], + num_attention_heads=model_metadata["n_text_head"], + num_key_value_heads=model_metadata["n_text_head"], + hidden_act="gelu", + intermediate_size=4 * model_metadata["n_text_state"], + norm_epsilon=1e-5, + position_embedding_type="learned_absolute", + world_size=1, + tp_size=1, + pp_size=1, + gpus_per_node=1, + quantization=tensorrt_llm.models.modeling_utils.QuantConfig(), + head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"], + num_layers=model_metadata["n_text_layer"], + num_heads=model_metadata["n_text_head"], + ffn_hidden_size=4 * model_metadata["n_text_state"], + encoder_hidden_size=model_metadata["n_text_state"], + encoder_num_heads=model_metadata["n_text_head"], + has_position_embedding=True, + relative_attention=False, + max_distance=0, + num_buckets=0, + has_embedding_layernorm=False, + has_embedding_scale=False, + q_scaling=1.0, + has_attention_qkvo_bias=True, + has_mlp_bias=True, + has_model_final_layernorm=True, + layernorm_eps=1e-5, + layernorm_position=LayerNormPositionType.pre_layernorm, + layernorm_type=LayerNormType.LayerNorm, + rescale_before_lm_head=False, + encoder_head_size=model_metadata["n_text_state"] + // model_metadata["n_text_head"], # Added missing variable + skip_cross_qkv=False, + ) ) if args.use_weight_only: @@ -377,7 +394,7 @@ def build_decoder(model, args): model_metadata["n_audio_ctx"], ) - tensorrt_llm_whisper_decoder(*inputs) + tensorrt_llm_whisper_decoder(**inputs) if args.debug_mode: for k, v in tensorrt_llm_whisper_decoder.named_network_outputs(): diff --git a/src/wordcab_transcribe/services/asr_service.py b/src/wordcab_transcribe/services/asr_service.py index 19bf25e..ca078ed 100644 --- a/src/wordcab_transcribe/services/asr_service.py +++ b/src/wordcab_transcribe/services/asr_service.py @@ -336,10 +336,12 @@ def create_transcription_local_service(self) -> None: def create_diarization_local_service(self) -> None: """Create a local diarization service.""" if settings.diarization_backend == "longform-diarizer": + logger.info("Using LongFormDiarizeService for diarization.") self.local_services.diarization = LongFormDiarizeService( device=self.device, ) else: + logger.info("Using DiarizeService for diarization.") self.local_services.diarization = DiarizeService( device=self.device, device_index=self.device_index, diff --git a/src/wordcab_transcribe/services/post_processing_service.py b/src/wordcab_transcribe/services/post_processing_service.py index 3bc82d8..ffa7660 100644 --- a/src/wordcab_transcribe/services/post_processing_service.py +++ b/src/wordcab_transcribe/services/post_processing_service.py @@ -362,6 +362,7 @@ def reconstruct_multi_channel_utterances( sentences = [] for speaker, word in transcript_words: start_t, end_t, text = word.start, word.end, word.word + print(speaker, previous_speaker, text) if speaker != previous_speaker: sentences.append(current_sentence) diff --git a/src/wordcab_transcribe/services/transcribe_service.py b/src/wordcab_transcribe/services/transcribe_service.py index 554af54..8f2fb9e 100644 --- a/src/wordcab_transcribe/services/transcribe_service.py +++ b/src/wordcab_transcribe/services/transcribe_service.py @@ -251,26 +251,19 @@ def __call__( _outputs = [segment._asdict() for segment in segments] outputs = TranscriptionOutput(segments=_outputs) - else: - outputs = [] - for audio_index, audio_file in enumerate(audio): - outputs.append( - self.multi_channel( - audio_file, - source_lang=source_lang, - speaker_id=audio_index, - suppress_blank=suppress_blank, - word_timestamps=word_timestamps, - internal_vad=internal_vad, - repetition_penalty=repetition_penalty, - compression_ratio_threshold=compression_ratio_threshold, - log_prob_threshold=log_prob_threshold, - no_speech_threshold=no_speech_threshold, - prompt=prompt, - ) - ) - + outputs = self.multi_channel( + audio, + source_lang=source_lang, + suppress_blank=suppress_blank, + word_timestamps=word_timestamps, + internal_vad=internal_vad, + repetition_penalty=repetition_penalty, + compression_ratio_threshold=compression_ratio_threshold, + log_prob_threshold=log_prob_threshold, + no_speech_threshold=no_speech_threshold, + prompt=prompt, + ) return outputs async def async_live_transcribe( @@ -323,7 +316,7 @@ def live_transcribe( def multi_channel( self, - audio: Union[str, torch.Tensor, TensorShare], + audio_list: List[Union[str, torch.Tensor, TensorShare]], source_lang: str, speaker_id: int, suppress_blank: bool = False, @@ -340,7 +333,7 @@ def multi_channel( Transcribe an audio file using the faster-whisper original pipeline. Args: - audio (Union[str, torch.Tensor, TensorShare]): Audio file path or loaded audio. + audio_list (List[Union[str, torch.Tensor, TensorShare]]): List of audio file paths or audio tensors. source_lang (str): Language of the audio file. speaker_id (int): Speaker ID used in the diarization. suppress_blank (bool): @@ -367,43 +360,105 @@ def multi_channel( Returns: MultiChannelTranscriptionOutput: Multi-channel transcription segments in a list. """ - if isinstance(audio, torch.Tensor): - _audio = audio.numpy() - elif isinstance(audio, TensorShare): - ts = audio.to_tensors(backend=Backend.NUMPY) - _audio = ts["audio"] + outputs = [] - final_segments = [] + if self.model_engine == "faster-whisper": + for speaker_id, audio in enumerate(audio_list): + final_segments = [] + if isinstance(audio, torch.Tensor): + _audio = audio.numpy() + elif isinstance(audio, TensorShare): + ts = audio.to_tensors(backend=Backend.NUMPY) + _audio = ts["audio"] - segments, _ = self.model.transcribe( - _audio, - language=source_lang, - initial_prompt=prompt, - repetition_penalty=repetition_penalty, - compression_ratio_threshold=compression_ratio_threshold, - log_prob_threshold=log_prob_threshold, - no_speech_threshold=no_speech_threshold, - condition_on_previous_text=condition_on_previous_text, - suppress_blank=suppress_blank, - word_timestamps=word_timestamps, - vad_filter=internal_vad, - vad_parameters={ - "threshold": 0.5, - "min_speech_duration_ms": 250, - "min_silence_duration_ms": 100, - "speech_pad_ms": 30, - "window_size_samples": 512, - }, - ) + segments, _ = self.model.transcribe( + _audio, + language=source_lang, + initial_prompt=prompt, + repetition_penalty=repetition_penalty, + compression_ratio_threshold=compression_ratio_threshold, + log_prob_threshold=log_prob_threshold, + no_speech_threshold=no_speech_threshold, + condition_on_previous_text=condition_on_previous_text, + suppress_blank=suppress_blank, + word_timestamps=word_timestamps, + vad_filter=internal_vad, + vad_parameters={ + "threshold": 0.5, + "min_speech_duration_ms": 250, + "min_silence_duration_ms": 100, + "speech_pad_ms": 30, + "window_size_samples": 512, + }, + ) - for segment in segments: - _segment = MultiChannelSegment( - start=segment.start, - end=segment.end, - text=segment.text, - words=[Word(**word._asdict()) for word in segment.words], - speaker=speaker_id, + for segment in segments: + _segment = MultiChannelSegment( + start=segment.start, + end=segment.end, + text=segment.text, + words=[Word(**word._asdict()) for word in segment.words], + speaker=speaker_id, + ) + final_segments.append(_segment) + + outputs.append(final_segments) + elif self.model_engine == "tensorrt-llm": + audio_channels = [] + speaker_ids = [] + + for speaker_id, audio in enumerate(audio_list): + if isinstance(audio, torch.Tensor): + audio = audio.numpy() + elif isinstance(audio, TensorShare): + ts = audio.to_tensors(backend=Backend.NUMPY) + audio = ts["audio"] + audio_channels.append(audio) + speaker_ids.append(speaker_id) + + channels_len = len(audio_channels) + segments_list = self.model.transcribe( + audio_data=audio_channels, + lang_codes=[source_lang] * channels_len, + tasks=["transcribe"] * channels_len, + initial_prompts=[prompt] * channels_len, + batch_size=channels_len, + use_vad=internal_vad, + generate_kwargs={"num_beams": 1}, ) - final_segments.append(_segment) - return MultiChannelTranscriptionOutput(segments=final_segments) + for speaker_id, segments in enumerate(segments_list): + final_segments = [] + + for segment in segments: + segment["words"] = segment.pop("word_timestamps") + for word in segment["words"]: + word["word"] = f" {word['word']}" + word["start"] = round(word["start"], 2) + word["end"] = round(word["end"], 2) + segment["text"] = segment["text"].strip() + + segment["start"] = round(segment.pop("start_time"), 2) + segment["end"] = round(segment.pop("end_time"), 2) + extra = { + "seek": 1, + "id": 1, + "tokens": [1], + "temperature": 0.0, + "avg_logprob": 0.0, + "compression_ratio": 0.0, + "no_speech_prob": 0.0, + } + _segment = Segment(**{**segment, **extra}) + _segment = MultiChannelSegment( + start=_segment.start, + end=_segment.end, + text=_segment.text, + words=[Word(**word) for word in _segment.words], + speaker=speaker_id, + ) + final_segments.append(_segment) + + outputs.append(MultiChannelTranscriptionOutput(segments=final_segments)) + + return outputs