Updated TensorRT-LLM version to latest, enabled dual-channel for tens… #309

Merged: 3 commits, May 26, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ __pycache__/
.coverage.*
.DS_Store
.env_dev
.env_*
.nox/
.pytest_cache/
.python-version
32 changes: 20 additions & 12 deletions Dockerfile
@@ -1,12 +1,12 @@
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

ENV NVIDIA_DRIVER_CAPABILITIES ${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}

ENV PYTHONUNBUFFERED=1

ENV DEBIAN_FRONTEND=noninteractive
ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN apt update && apt install -y \
RUN apt-get update && apt-get install -y --no-install-recommends \
libsndfile1 \
software-properties-common \
ffmpeg \
@@ -28,9 +28,6 @@ RUN apt update && apt install -y \
python3-dev \
liblzma-dev \
libsqlite3-dev \
&& rm -rf /var/lib/apt/lists/*

RUN apt update && apt install -y \
libtiff-tools=4.3.0-6ubuntu0.8 \
libtiff5=4.3.0-6ubuntu0.8 \
libgnutls30=3.7.3-4ubuntu1.5 \
@@ -42,7 +39,8 @@ RUN apt update && apt install -y \
login=1:4.8.1-2ubuntu2.2 \
passwd=1:4.8.1-2ubuntu2.2 \
uidmap=1:4.8.1-2ubuntu2.2 \
binutils=2.38-4ubuntu2.6
binutils=2.38-4ubuntu2.6 \
&& rm -rf /var/lib/apt/lists/*

RUN cd /tmp && \
wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
@@ -57,20 +55,30 @@ RUN cd /tmp && \
RUN export CUDNN_PATH=$(python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))') && \
echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'${CUDNN_PATH} >> ~/.bashrc

ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \
&& sed -i 's/>= 40\\.9\\.0/>= 40.9.0, < 69/g' /tmp/mpi4py-${MPI4PY_VERSION}/pyproject.toml \
&& pip install /tmp/mpi4py-${MPI4PY_VERSION} \
&& rm -rf /tmp/mpi4py*

RUN python -m pip install pip --upgrade

COPY pre_requirements.txt .
COPY requirements.txt .

RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt

WORKDIR /app

COPY . .
RUN git clone https://github.com/NVIDIA/NeMo.git ./nemo_local && \
cd ./nemo_local && \
git config --global user.email "[email protected]" && \
git config --global user.name "Your Name" && \
git fetch origin pull/9114/head:pr9114 && \
git merge pr9114 && \
pip install -e ".[asr]"

ENV PYTHONPATH="/app/src"

RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com .[runtime]
COPY . .

CMD ["uvicorn", "--host=0.0.0.0", "--port=5001", "src.wordcab_transcribe.main:app"]
13 changes: 13 additions & 0 deletions pre_requirements.txt
@@ -0,0 +1,13 @@
argon2-cffi==23.1.0
fastapi==0.110.0
python-jose[cryptography]==3.3.0
python-multipart==0.0.9
shortuuid==1.0.13
svix==1.21.0
uvicorn==0.29.0
websockets==12.0
tensorrt_llm==0.11.0.dev2024052100
Cython==3.0.10
youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies
deepmultilingualpunctuation==1.0.1
pyannote.audio==3.2.0
27 changes: 27 additions & 0 deletions requirements.txt
@@ -0,0 +1,27 @@
aiohttp==3.9.3
aiofiles==23.2.1
boto3
faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
ffmpeg-python==0.2.0
transformers==4.38.2
librosa==0.10.1
loguru==0.7.2
nltk==3.8.1
numpy==1.26.4
onnxruntime==1.17.1
pandas==2.2.1
pydantic==2.6.4
python-dotenv==1.0.1
tensorshare==0.1.1
torch==2.2.2
torchaudio==2.2.2
wget==3.2.0
yt-dlp==2024.3.10
tiktoken==0.6.0
datasets==2.18.0
kaldialign==0.9.0
openai-whisper==v20231117
soundfile==0.12.1
safetensors==0.4.2
janus==1.0.0
backports.lzma==0.0.14
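
The dependency pins are split across two files so the Dockerfile can resolve both in a single pass against the NVIDIA extra index (pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com -r pre_requirements.txt -r requirements.txt, as shown in the Dockerfile above). A hedged post-build sanity check; the version attributes are standard, and the expected values only restate the pins rather than verified output:

# Hedged sanity-check sketch: run inside the built image to confirm the
# pinned stack resolved as declared above.
import tensorrt_llm
import torch
import transformers

print("tensorrt_llm:", tensorrt_llm.__version__)  # pre_requirements.txt pins 0.11.0.dev2024052100
print("torch:", torch.__version__)                 # requirements.txt pins 2.2.2
print("transformers:", transformers.__version__)   # requirements.txt pins 4.38.2
assert torch.cuda.is_available(), "expects the CUDA runtime from the nvidia/cuda base image"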
73 changes: 45 additions & 28 deletions src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py
@@ -22,10 +22,10 @@
from tensorrt_llm.builder import Builder
from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
from tensorrt_llm.logger import logger
from tensorrt_llm.models import quantize_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
from weight import load_decoder_weight, load_encoder_weight

MODEL_ENCODER_NAME = "whisper_encoder"
@@ -316,32 +316,49 @@ def build_decoder(model, args):
)

tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
hidden_size=model_metadata["n_text_state"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
vocab_size=model_metadata["n_vocab"],
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
max_position_embeddings=model_metadata["n_text_ctx"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
hidden_act="gelu",
rescale_before_lm_head=False,
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
)
)

if args.use_weight_only:
@@ -377,7 +394,7 @@ def build_decoder(model, args):
model_metadata["n_audio_ctx"],
)

tensorrt_llm_whisper_decoder(*inputs)
tensorrt_llm_whisper_decoder(**inputs)

if args.debug_mode:
for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():
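
A note on the import and call-site changes above: quantize_model moved out of tensorrt_llm.models into tensorrt_llm.quantization.quantize_by_modelopt in the newer TensorRT-LLM release pinned by this PR, and the decoder is now invoked with **inputs rather than *inputs, presumably because the newer forward signature expects named tensors instead of a positional tuple. If a build script has to run against both the old and the new pin, a small compatibility shim is possible; this hedged sketch assumes only the two import paths visible in this diff:

# Hedged compatibility sketch: which path exists depends on the installed
# TensorRT-LLM release; both locations appear in this diff.
try:
    # Release pinned in pre_requirements.txt (0.11.0.dev2024052100)
    from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
except ImportError:
    # Older releases exposed it from tensorrt_llm.models
    from tensorrt_llm.models import quantize_model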
2 changes: 2 additions & 0 deletions src/wordcab_transcribe/services/asr_service.py
@@ -336,10 +336,12 @@ def create_transcription_local_service(self) -> None:
def create_diarization_local_service(self) -> None:
"""Create a local diarization service."""
if settings.diarization_backend == "longform-diarizer":
logger.info("Using LongFormDiarizeService for diarization.")
self.local_services.diarization = LongFormDiarizeService(
device=self.device,
)
else:
logger.info("Using DiarizeService for diarization.")
self.local_services.diarization = DiarizeService(
device=self.device,
device_index=self.device_index,
1 change: 1 addition & 0 deletions src/wordcab_transcribe/services/post_processing_service.py
@@ -362,6 +362,7 @@ def reconstruct_multi_channel_utterances(
sentences = []
for speaker, word in transcript_words:
start_t, end_t, text = word.start, word.end, word.word
print(speaker, previous_speaker, text)

if speaker != previous_speaker:
sentences.append(current_sentence)
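
For context on the loop being traced here: multi-channel transcripts arrive as a time-ordered stream of (speaker, word) pairs, and reconstruct_multi_channel_utterances regroups them into per-speaker sentences by closing the current sentence whenever the speaker changes; the added print reads like a temporary trace of those transitions. A simplified, hedged sketch of the regrouping idea (the real function also carries start/end timestamps and other sentence state not shown):

# Illustrative only: minimal speaker-change regrouping over (speaker, word) pairs.
def regroup_by_speaker(transcript_words):
    sentences, current, previous_speaker = [], [], None
    for speaker, text in transcript_words:
        if previous_speaker is not None and speaker != previous_speaker:
            sentences.append((previous_speaker, " ".join(current)))
            current = []
        current.append(text)
        previous_speaker = speaker
    if current:
        sentences.append((previous_speaker, " ".join(current)))
    return sentences

# regroup_by_speaker([(0, "hello"), (0, "there"), (1, "hi")])
# -> [(0, "hello there"), (1, "hi")]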