Skip to content

Commit

Permalink
Merge pull request #311 from Wordcab/286-integrate-new-nemo-diarizati…
Browse files Browse the repository at this point in the history
…on-changes

Adding fixes to make wordcab-transcribe easier to run, adding support for dual channel audio
  • Loading branch information
aleksandr-smechov authored Jun 12, 2024
2 parents f5720f5 + e77badf commit 2cda3f0
Show file tree
Hide file tree
Showing 21 changed files with 292 additions and 128 deletions.
9 changes: 7 additions & 2 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ WHISPER_MODEL="large-v3"
# You can specify one of two engines, "faster-whisper" or "tensorrt-llm". At the moment, "faster-whisper" is more
# stable, adjustable, and accurate, while "tensorrt-llm" is faster but less accurate and adjustable.
WHISPER_ENGINE="tensorrt-llm"
# This helps adjust some build settings during the conversion of the Whisper model to TensorRT. If you change this, be
# sure to also change it in pre_requirements.txt. The only available options are "0.9.0.dev2024032600" and "0.11.0.dev2024052100".
# Note that version "0.11.0.dev2024052100" is not compatible with T4 or V100 GPUs.
TENSORRT_LLM_VERSION="0.9.0.dev2024032600"
# The align model is used for aligning timestamps under the "tensorrt-llm" engine. The available options are:
# "tiny", "small", "base", or "medium".
ALIGN_MODEL="tiny"
Expand All @@ -60,7 +64,7 @@ TOKENIZERS_PARALLELISM=False
# The diarization_backend parameter is used to control the diarization model used. The available options are:
# "longform-diarizer" or "default-diarizer". It's suggested to use "default-diarizer" for better stability.
# The "longform-diarizer" is still being developed.
DIARIZATION_BACKEND="default-diarizer"
DIARIZATION_BACKEND="longform-diarizer"
# In a MSDD (Multiscale Diarization Decoder) model, the diarization model is trained on multiple window lengths.
# The window_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
# be "1.5, 1.25, 1.0, 0.75, 0.5".
Expand Down Expand Up @@ -97,7 +101,7 @@ ASR_TYPE="async"
#
# Include the cortex endpoint in the API. This endpoint is used to process audio files from the Cortex API.
# Use this only if you deploy the API using Cortex and Kubernetes.
CORTEX_ENDPOINT=True
CORTEX_ENDPOINT=False
#
# ---------------------------------------- API AUTHENTICATION CONFIGURATION ------------------------------------------ #
# The API authentication is used to control the access to the API endpoints.
Expand Down Expand Up @@ -130,6 +134,7 @@ SVIX_APP_ID=
#
# ----------------------------------------------- AWS CONFIGURATION ------------------------------------------------- #
#
SEND_RESULTS_TO_S3=False
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_STORAGE_BUCKET_NAME=
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,7 @@ test.ipynb
test.py
whisper_model
whisper_model_he
storage
nemo_storage
nemo_local
error.log
22 changes: 0 additions & 22 deletions .pre-commit-config.yaml

This file was deleted.

1 change: 1 addition & 0 deletions error.log
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/app/temp_outputs/mono_file.wav:[Errno 2] No such file or directory: '/app/temp_outputs/mono_file.wav'
1 change: 1 addition & 0 deletions nemo_local
Submodule nemo_local added at 5703d9
1 change: 1 addition & 0 deletions nemo_storage/infer_manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"audio_filepath": "/app/temp_outputs/mono_file.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "rttm_filepath": null, "uem_filepath": null}
2 changes: 1 addition & 1 deletion pre_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ shortuuid==1.0.13
svix==1.21.0
uvicorn==0.29.0
websockets==12.0
tensorrt_llm==0.11.0.dev2024052100
tensorrt_llm==0.9.0.dev2024032600
Cython==3.0.10
youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies
deepmultilingualpunctuation==1.0.1
Expand Down
4 changes: 3 additions & 1 deletion src/wordcab_transcribe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class Settings:
# Cortex configuration
cortex_api_key: str
# AWS configuration
send_results_to_s3: bool
aws_access_key_id: str
aws_secret_access_key: str
aws_storage_bucket_name: str
Expand Down Expand Up @@ -137,7 +138,7 @@ def align_model_compatibility_check(cls, value: str): # noqa: B902, N805
"""Check that the whisper engine is compatible."""
if value.lower() not in ["tiny", "small", "base", "medium"]:
raise ValueError(
"The whisper engine must be one of `tiny`, `small`, `base`, or"
"The align model must be one of `tiny`, `small`, `base`, or"
" `medium`."
)

Expand Down Expand Up @@ -348,6 +349,7 @@ def __post_init__(self):
# Cortex configuration
cortex_api_key=getenv("WORDCAB_TRANSCRIBE_API_KEY", ""),
# AWS configuration
send_results_to_s3=getenv("SEND_RESULTS_TO_S3", False),
aws_access_key_id=getenv("AWS_ACCESS_KEY_ID", ""),
aws_secret_access_key=getenv("AWS_SECRET_ACCESS_KEY", ""),
aws_storage_bucket_name=getenv("AWS_STORAGE_BUCKET_NAME", ""),
Expand Down
101 changes: 71 additions & 30 deletions src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import argparse

import tensorrt_llm
import torch
from loguru import logger

import tensorrt_llm
from tensorrt_llm import str_dtype_to_torch, str_dtype_to_trt
from tensorrt_llm.builder import Builder
from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
from tensorrt_llm.logger import logger
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
from weight import load_decoder_weight, load_encoder_weight

MODEL_ENCODER_NAME = "whisper_encoder"
MODEL_DECODER_NAME = "whisper_decoder"

# The location of `quantize_model` differs between the supported tensorrt_llm
# releases, so the import is selected from the TENSORRT_LLM_VERSION env variable.
# Default to an empty string so that an unset variable reaches the explicit
# ValueError below instead of failing with "argument of type 'NoneType' is not
# iterable" on the membership test.
TENSORRT_LLM_VERSION = os.getenv("TENSORRT_LLM_VERSION", "")
if "0.9.0" in TENSORRT_LLM_VERSION:
    from tensorrt_llm.models import quantize_model
elif "0.11.0" in TENSORRT_LLM_VERSION:
    from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
else:
    raise ValueError(f"Unsupported version of tensorrt_llm: {TENSORRT_LLM_VERSION}")

def get_engine_name(model, dtype, tp_size=1, rank=0):
    """Return the engine file name for a model/dtype/parallelism combination.

    Args:
        model: Model name used as the file name prefix.
        dtype: Data type label embedded in the file name.
        tp_size: Tensor-parallel size embedded in the file name.
        rank: Rank embedded in the file name.

    Returns:
        A string of the form ``<model>_<dtype>_tp<tp_size>_rank<rank>.engine``.
    """
    return f"{model}_{dtype}_tp{tp_size}_rank{rank}.engine"
Expand Down Expand Up @@ -79,7 +87,7 @@ def parse_arguments():
parser.add_argument("--quantize_dir", type=str, default="quantize/1-gpu")
parser.add_argument("--dtype", type=str, default="float16", choices=["float16"])
parser.add_argument("--log_level", type=str, default="info")
parser.add_argument("--max_batch_size", type=int, default=24)
parser.add_argument("--max_batch_size", type=int, default=16)
parser.add_argument("--max_input_len", type=int, default=4)
parser.add_argument("--max_output_len", type=int, default=448)
parser.add_argument("--max_beam_width", type=int, default=1)
Expand Down Expand Up @@ -315,32 +323,63 @@ def build_decoder(model, args):
int8=args.quant_mode.has_act_or_weight_quant(),
)

tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
try:
tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
)
)
except:
tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
hidden_size=model_metadata["n_text_state"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
vocab_size=model_metadata["n_vocab"],
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
max_position_embeddings=model_metadata["n_text_ctx"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
Expand All @@ -354,12 +393,11 @@ def build_decoder(model, args):
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
hidden_act="gelu",
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
)
)

if args.use_weight_only:
tensorrt_llm_whisper_decoder = quantize_model(
Expand Down Expand Up @@ -394,7 +432,10 @@ def build_decoder(model, args):
model_metadata["n_audio_ctx"],
)

tensorrt_llm_whisper_decoder(**inputs)
if "0.9.0" in TENSORRT_LLM_VERSION:
tensorrt_llm_whisper_decoder(*inputs)
else:
tensorrt_llm_whisper_decoder(**inputs)

if args.debug_mode:
for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,31 @@
}


def _env_int(name, default):
    """Return environment variable *name* as an int, or *default* if unset or empty."""
    raw_value = os.getenv(name, None)
    return int(raw_value) if raw_value else default


# Build limits for the TensorRT engine; each can be overridden through the
# environment variable of the same name, and the effective value is logged.
TRT_BUILD_MAX_OUTPUT_LEN = _env_int("TRT_BUILD_MAX_OUTPUT_LEN", 448)
logger.info(f"TRT_BUILD_MAX_OUTPUT_LEN: {TRT_BUILD_MAX_OUTPUT_LEN}")

TRT_BUILD_MAX_BEAM_WIDTH = _env_int("TRT_BUILD_MAX_BEAM_WIDTH", 1)
logger.info(f"TRT_BUILD_MAX_BEAM_WIDTH: {TRT_BUILD_MAX_BEAM_WIDTH}")


def build_whisper_trt_model(
output_dir,
use_gpt_attention_plugin=True,
use_gemm_plugin=True,
use_bert_attention_plugin=True,
enable_context_fmha=True,
use_weight_only=False,
model_name="distil-large-v2",
max_output_len=TRT_BUILD_MAX_OUTPUT_LEN,
max_beam_width=TRT_BUILD_MAX_BEAM_WIDTH,
model_name="large-v3",
):
"""
Build a Whisper model using the specified configuration.
Expand Down Expand Up @@ -158,6 +175,10 @@ def build_whisper_trt_model(
command.append("--enable_context_fmha")
if use_weight_only:
command.append("--use_weight_only")
if max_output_len:
command.extend(["--max_output_len", str(max_output_len)])
if max_beam_width:
command.extend(["--max_beam_width", str(max_beam_width)])

try:
subprocess.run(command, check=True)
Expand Down
8 changes: 5 additions & 3 deletions src/wordcab_transcribe/engines/tensorrt_llm/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def exact_div(x, y):
"best_of": 5,
"patience": 1,
"length_penalty": 1,
"repetition_penalty": 1.01,
"repetition_penalty": 1.05,
"no_repeat_ngram_size": 0,
"compression_ratio_threshold": 2.4,
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.3,
"no_speech_threshold": 0.4,
"prefix": None,
"suppress_blank": False,
"suppress_tokens": [-1],
Expand All @@ -61,7 +61,7 @@ def exact_div(x, y):
"best_of": 1,
"patience": 2,
"length_penalty": 1,
"repetition_penalty": 1.01,
"repetition_penalty": 1.05,
"no_repeat_ngram_size": 0,
"compression_ratio_threshold": 2.4,
"log_prob_threshold": -1.0,
Expand Down Expand Up @@ -224,7 +224,9 @@ def align_words(
start_seq_wise_req[_sot_seq] = [_idx]

token_alignments = [[] for _ in seg_metadata]

for start_seq, req_idx in start_seq_wise_req.items():

res = self.align_model.align(
ctranslate2.StorageView.from_array(features[req_idx]),
start_sequence=list(start_seq),
Expand Down
4 changes: 2 additions & 2 deletions src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ def get_session(self, engine_dir, runtime_mapping, debug_mode=False):

# TODO: Make dynamic max_batch_size and max_beam_width
decoder_model_config = ModelConfig(
max_batch_size=24,
max_beam_width=1,
max_batch_size=16,
max_beam_width=5,
num_heads=self.decoder_config["num_heads"],
num_kv_heads=self.decoder_config["num_heads"],
hidden_size=self.decoder_config["hidden_size"],
Expand Down
1 change: 1 addition & 0 deletions src/wordcab_transcribe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ class BaseRequest(BaseModel):
diarization: bool = False
batch_size: int = 1
source_lang: str = "en"
num_beams: int = 1
timestamps: Timestamps = Timestamps.seconds
vocab: Union[List[str], None] = None
word_timestamps: bool = False
Expand Down
Loading

0 comments on commit 2cda3f0

Please sign in to comment.