Skip to content

Commit

Permalink
Merge pull request #290 from Wordcab/285-try-whispers2t-as-potential-backend
Browse files Browse the repository at this point in the history

TensorRT-LLM backend based on WhisperS2T
  • Loading branch information
info-wordcab authored Apr 1, 2024
2 parents fa83363 + 35ee65c commit 419278c
Show file tree
Hide file tree
Showing 92 changed files with 54,837 additions and 501 deletions.
2 changes: 1 addition & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__pycache__
.coverage
.darglint
.env_dev
.flake8
.gitignore
.nox
Expand All @@ -17,5 +18,4 @@ noxfile.py
test.py
tests
whisper_model
whisper_model_he
wordcab_transcribe/**/__pycache__
39 changes: 30 additions & 9 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# The name of the project, used for API documentation.
PROJECT_NAME="Wordcab Transcribe"
# The version of the project, used for API documentation.
VERSION="0.5.1"
VERSION="0.5.3"
# The description of the project, used for API documentation.
DESCRIPTION="💬 ASR FastAPI server using faster-whisper and Auto-Tuning Spectral Clustering for diarization."
# This API prefix is used for all endpoints in the API outside of the status and cortex endpoints.
Expand All @@ -24,35 +24,56 @@ DEBUG=True
#
# Cloud models:
# The available models are: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, or large-v2
# You can try different model sizes, but you should see a trade-off between performance and speed.
# You can try different model sizes, but you should see a trade-off between performance and speed. Note that the
# "distil" whisper models do not support languages other than English.
#
# Local models:
# You can also link a local folder path to use a custom model. If you do so, you should also mount the folder in the
# docker run command as a volume.
# e.g. WHISPER_MODEL="/app/models/custom"
# docker cmd: -v /path/to/custom/model:/app/models/custom
WHISPER_MODEL="large-v2"
WHISPER_MODEL="large-v3"
# You can specify one of two engines, "faster-whisper" or "tensorrt-llm". At the moment, "faster-whisper" is more
# stable, adjustable, and accurate, while "tensorrt-llm" is faster but less accurate and adjustable.
WHISPER_ENGINE="tensorrt-llm"
# The align model is used for aligning timestamps under the "tensorrt-llm" engine. The available options are:
# "tiny", "small", "base", or "medium".
ALIGN_MODEL="tiny"
# The compute_type parameter is used to control the precision of the model. You can choose between:
# "int8", "int8_float16", "int8_bfloat16", "int16", "float_16", "bfloat16", "float32".
# "int8", "int8_float16", "int8_bfloat16", "int16", "float_16", "bfloat16", "float32" if you're
# whisper engine is set to "fast-whisper". If you're using "tensorrt-llm", keep it at "float16".
# The default value is "float16".
COMPUTE_TYPE="float16"
# The extra_languages parameter is used to control the languages that need an extra model to be loaded.
# You can specify multiple languages separated by a comma. The available languages are: `he` (Hebrew).
# You can specify multiple languages separated by a comma.
EXTRA_LANGUAGES=
# This is used to control the parallelism of the tokenizers, but should be set to False for now.
TOKENIZERS_PARALLELISM=False
#
# --------------------------------------------------- DIARIZATION ---------------------------------------------------- #
#
# The diarization_backend parameter is used to control the diarization model used. The available options are:
# "longform_diarizer" or "default_diarizer". It's suggested to use "default_diarizer" for better stability.
# The "longform_diarizer" is still being developed.
DIARIZATION_BACKEND="default_diarizer"
# In a MSDD (Multiscale Diarization Decoder) model, the diarization model is trained on multiple window lengths.
# The window_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
# be "1.5, 1.25, 1.0, 0.75, 0.5".
WINDOW_LENGTHS="1.5,1.25,1.0,0.75,0.5"
WINDOW_LENGTHS="2.0,1.5,1.0,0.75,0.5"
# The shift_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
# be "0.75, 0.625, 0.5, 0.375, 0.25".
SHIFT_LENGTHS="0.75,0.625,0.5,0.375,0.25"
SHIFT_LENGTHS="1.0,0.75,0.625,0.5,0.25"
# The multiscale_weights are float values separated by a comma. If not specified, the default value will be
# "1.0, 1.0, 1.0, 1.0, 1.0".
MULTISCALE_WEIGHTS="1.0,1.0,1.0,1.0,1.0"
#
# --------------------------------------------------- POST-PROCESSING------------------------------------------------- #
#
# This parameter is used to control the punctuation-based alignment. If set to True, the predicted punctuation
# will be used to adjust speaker diarization. The default value is True, but note this comes with a performance
# tradeoff.
ENABLE_PUNCTUATION_BASED_ALIGNMENT=False
#
# ---------------------------------------------- ASR TYPE CONFIGURATION ---------------------------------------------- #
#
# The asr_type parameter is used to control the type of ASR used. The available options are: `async` or `live`.
Expand Down Expand Up @@ -82,7 +103,7 @@ USERNAME="admin"
PASSWORD="admin"
# This openssl_key parameter is used to control the key used to encrypt the access tokens.
# You should absolutely change this value before deploying the API in production.
OPENSSL_KEY="0123456789abcdefghijklmnopqrstuvwyz" # <--- CHANGE ABSOLUTELY THIS VALUE
OPENSSL_KEY="0123456789abcdefghijklmnopqrstuvwyz" # <--- CHANGE THIS VALUE
# This openssl_algorithm parameter is used to control the algorithm used to encrypt the access tokens.
# You should in most case not change this value.
OPENSSL_ALGORITHM="HS256"
Expand All @@ -107,7 +128,7 @@ SVIX_APP_ID=
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_STORAGE_BUCKET_NAME=
AWS_S3_REGION_NAME=
AWS_REGION_NAME=
#
# -------------------------------------------------- REMOTE SERVERS -------------------------------------------------- #
# The remote servers configuration is used to control the number of servers used to process the requests if you don't
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __pycache__/
.coverage
.coverage.*
.DS_Store
.env_dev
.nox/
.pytest_cache/
.python-version
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ repos:
rev: "v4.3.0"
hooks:
- id: check-added-large-files
args: [--maxkb=2000]
- id: check-toml
- id: check-yaml

Expand Down
70 changes: 62 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,71 @@
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

ENV NVIDIA_DRIVER_CAPABILITIES ${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}

ENV PYTHONUNBUFFERED=1

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git \
curl \
ffmpeg \

RUN apt update && apt install -y \
libsndfile1 \
software-properties-common \
python3-pip
ffmpeg \
build-essential \
ca-certificates \
ccache \
cmake \
gnupg2 \
wget \
git \
curl \
gdb \
openmpi-bin \
libopenmpi-dev \
libffi-dev \
libssl-dev \
python3-pip \
libbz2-dev \
python3-dev \
liblzma-dev \
&& rm -rf /var/lib/apt/lists/*

ENV PYTHONUNBUFFERED=1
RUN python3 -m pip install pip --upgrade \
&& python3 -m pip install hatch
RUN apt update && apt install -y \
libtiff-tools=4.3.0-6ubuntu0.8 \
libtiff5=4.3.0-6ubuntu0.8 \
libgnutls30=3.7.3-4ubuntu1.4 \
openssl=3.0.2-0ubuntu1.15 \
libpam-modules=1.4.0-11ubuntu2.4 \
libpam-modules-bin=1.4.0-11ubuntu2.4 \
libpam-runtime=1.4.0-11ubuntu2.4 \
libpam0g=1.4.0-11ubuntu2.4 \
login=1:4.8.1-2ubuntu2.2 \
passwd=1:4.8.1-2ubuntu2.2 \
uidmap=1:4.8.1-2ubuntu2.2 \
binutils=2.38-4ubuntu2.6

RUN cd /tmp && \
wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
tar -xvf Python-3.10.12.tgz && \
cd Python-3.10.12 && \
./configure --enable-optimizations --with-ssl && \
make && make install && \
cd .. && rm -r Python-3.10.12 && \
ln -s /usr/local/bin/python3 /usr/local/bin/python && \
ln -s /usr/local/bin/pip3 /usr/local/bin/pip

RUN export CUDNN_PATH=$(python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))') && \
echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'${CUDNN_PATH} >> ~/.bashrc

ENV MPI4PY_VERSION="3.1.5"
ENV RELEASE_URL="https://github.com/mpi4py/mpi4py/archive/refs/tags/${MPI4PY_VERSION}.tar.gz"

RUN curl -L ${RELEASE_URL} | tar -zx -C /tmp \
&& sed -i 's/>= 40\\.9\\.0/>= 40.9.0, < 69/g' /tmp/mpi4py-${MPI4PY_VERSION}/pyproject.toml \
&& pip install /tmp/mpi4py-${MPI4PY_VERSION} \
&& rm -rf /tmp/mpi4py*

RUN python -m pip install pip --upgrade \
&& python -m pip install hatch

WORKDIR /app
COPY . .
Expand Down
Loading

0 comments on commit 419278c

Please sign in to comment.