Skip to content

Commit

Permalink
Updated TensorRT-LLM version to latest, enabled dual-channel for tensorrt-llm backend
Browse files Browse the repository at this point in the history
  • Loading branch information
Aleks committed May 26, 2024
1 parent b2f0d3b commit b022cba
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 41 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ __pycache__/
.coverage.*
.DS_Store
.env_dev
.env_*
.nox/
.pytest_cache/
.python-version
Expand Down
32 changes: 20 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

ENV NVIDIA_DRIVER_CAPABILITIES ${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}

ENV PYTHONUNBUFFERED=1

ENV DEBIAN_FRONTEND=noninteractive
ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN apt update && apt install -y \
RUN apt-get update && apt-get install -y --no-install-recommends \
libsndfile1 \
software-properties-common \
ffmpeg \
Expand All @@ -28,9 +28,6 @@ RUN apt update && apt install -y \
python3-dev \
liblzma-dev \
libsqlite3-dev \
&& rm -rf /var/lib/apt/lists/*

RUN apt update && apt install -y \
libtiff-tools=4.3.0-6ubuntu0.8 \
libtiff5=4.3.0-6ubuntu0.8 \
libgnutls30=3.7.3-4ubuntu1.5 \
Expand All @@ -42,7 +39,8 @@ RUN apt update && apt install -y \
login=1:4.8.1-2ubuntu2.2 \
passwd=1:4.8.1-2ubuntu2.2 \
uidmap=1:4.8.1-2ubuntu2.2 \
binutils=2.38-4ubuntu2.6
binutils=2.38-4ubuntu2.6 \
&& rm -rf /var/lib/apt/lists/*

RUN cd /tmp && \
wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
Expand All @@ -57,20 +55,30 @@ RUN cd /tmp && \
RUN export CUDNN_PATH=$(python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))') && \
echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'${CUDNN_PATH} >> ~/.bashrc

ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \
&& sed -i 's/>= 40\\.9\\.0/>= 40.9.0, < 69/g' /tmp/mpi4py-${MPI4PY_VERSION}/pyproject.toml \
&& pip install /tmp/mpi4py-${MPI4PY_VERSION} \
&& rm -rf /tmp/mpi4py*

RUN python -m pip install pip --upgrade

COPY pre_requirements.txt .
COPY requirements.txt .

RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt

WORKDIR /app

COPY . .
RUN git clone https://github.com/NVIDIA/NeMo.git ./nemo_local && \
cd ./nemo_local && \
git config --global user.email "[email protected]" && \
git config --global user.name "Your Name" && \
git fetch origin pull/9114/head:pr9114 && \
git merge pr9114 && \
pip install -e ".[asr]"

ENV PYTHONPATH="/app/src"

RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com .[runtime]
COPY . .

CMD ["uvicorn", "--host=0.0.0.0", "--port=5001", "src.wordcab_transcribe.main:app"]
13 changes: 13 additions & 0 deletions pre_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
argon2-cffi==23.1.0
fastapi==0.110.0
python-jose[cryptography]==3.3.0
python-multipart==0.0.9
shortuuid==1.0.13
svix==1.21.0
uvicorn==0.29.0
websockets==12.0
tensorrt_llm==0.11.0.dev2024052100
Cython==3.0.10
youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies
deepmultilingualpunctuation==1.0.1
pyannote.audio==3.2.0
27 changes: 27 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
aiohttp==3.9.3
aiofiles==23.2.1
boto3
faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
ffmpeg-python==0.2.0
transformers==4.38.2
librosa==0.10.1
loguru==0.7.2
nltk==3.8.1
numpy==1.26.4
onnxruntime==1.17.1
pandas==2.2.1
pydantic==2.6.4
python-dotenv==1.0.1
tensorshare==0.1.1
torch==2.2.2
torchaudio==2.2.2
wget==3.2.0
yt-dlp==2024.3.10
tiktoken==0.6.0
datasets==2.18.0
kaldialign==0.9.0
openai-whisper==v20231117
soundfile==0.12.1
safetensors==0.4.2
janus==1.0.0
backports.lzma==0.0.14
73 changes: 45 additions & 28 deletions src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
from tensorrt_llm.builder import Builder
from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
from tensorrt_llm.logger import logger
from tensorrt_llm.models import quantize_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
from weight import load_decoder_weight, load_encoder_weight

MODEL_ENCODER_NAME = "whisper_encoder"
Expand Down Expand Up @@ -316,32 +316,49 @@ def build_decoder(model, args):
)

tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
hidden_size=model_metadata["n_text_state"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
vocab_size=model_metadata["n_vocab"],
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
max_position_embeddings=model_metadata["n_text_ctx"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
hidden_act="gelu",
rescale_before_lm_head=False,
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
)
)

if args.use_weight_only:
Expand Down Expand Up @@ -377,7 +394,7 @@ def build_decoder(model, args):
model_metadata["n_audio_ctx"],
)

tensorrt_llm_whisper_decoder(*inputs)
tensorrt_llm_whisper_decoder(**inputs)

if args.debug_mode:
for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():
Expand Down
2 changes: 2 additions & 0 deletions src/wordcab_transcribe/services/asr_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,12 @@ def create_transcription_local_service(self) -> None:
def create_diarization_local_service(self) -> None:
"""Create a local diarization service."""
if settings.diarization_backend == "longform-diarizer":
logger.info("Using LongFormDiarizeService for diarization.")
self.local_services.diarization = LongFormDiarizeService(
device=self.device,
)
else:
logger.info("Using DiarizeService for diarization.")
self.local_services.diarization = DiarizeService(
device=self.device,
device_index=self.device_index,
Expand Down
1 change: 1 addition & 0 deletions src/wordcab_transcribe/services/post_processing_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ def reconstruct_multi_channel_utterances(
sentences = []
for speaker, word in transcript_words:
start_t, end_t, text = word.start, word.end, word.word
print(speaker, previous_speaker, text)

if speaker != previous_speaker:
sentences.append(current_sentence)
Expand Down
4 changes: 3 additions & 1 deletion src/wordcab_transcribe/services/transcribe_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ def __call__(

_outputs = [segment._asdict() for segment in segments]
outputs = TranscriptionOutput(segments=_outputs)

else:
outputs = self.multi_channel(
audio,
Expand Down Expand Up @@ -319,6 +318,7 @@ def multi_channel(
self,
audio_list: List[Union[str, torch.Tensor, TensorShare]],
source_lang: str,
speaker_id: int,
suppress_blank: bool = False,
word_timestamps: bool = True,
internal_vad: bool = True,
Expand All @@ -335,6 +335,7 @@ def multi_channel(
Args:
audio_list (List[Union[str, torch.Tensor, TensorShare]]): List of audio file paths or audio tensors.
source_lang (str): Language of the audio file.
speaker_id (int): Speaker ID used in the diarization.
suppress_blank (bool):
Whether to suppress blank at the beginning of the sampling.
word_timestamps (bool):
Expand Down Expand Up @@ -436,6 +437,7 @@ def multi_channel(
word["start"] = round(word["start"], 2)
word["end"] = round(word["end"], 2)
segment["text"] = segment["text"].strip()

segment["start"] = round(segment.pop("start_time"), 2)
segment["end"] = round(segment.pop("end_time"), 2)
extra = {
Expand Down

0 comments on commit b022cba

Please sign in to comment.