Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding fixes to make wordcab-transcribe easier to run, adding support for dual channel audio #311

Merged
merged 1 commit into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ WHISPER_MODEL="large-v3"
# You can specify one of two engines, "faster-whisper" or "tensorrt-llm". At the moment, "faster-whisper" is more
# stable, adjustable, and accurate, while "tensorrt-llm" is faster but less accurate and adjustable.
WHISPER_ENGINE="tensorrt-llm"
# This helps adjust some build steps during the conversion of the Whisper model to TensorRT. If you change this, be sure to
# also change it in pre_requirements.txt. The only available options are "0.9.0.dev2024032600" and "0.11.0.dev2024052100".
# Note that version "0.11.0.dev2024052100" is not compatible with T4 or V100 GPUs.
TENSORRT_LLM_VERSION="0.9.0.dev2024032600"
# The align model is used for aligning timestamps under the "tensorrt-llm" engine. The available options are:
# "tiny", "small", "base", or "medium".
ALIGN_MODEL="tiny"
Expand All @@ -60,7 +64,7 @@ TOKENIZERS_PARALLELISM=False
# The diarization_backend parameter is used to control the diarization model used. The available options are:
# "longform-diarizer" or "default-diarizer". It's suggested to use "default-diarizer" for better stability.
# The "longform-diarizer" is still being developed.
DIARIZATION_BACKEND="default-diarizer"
DIARIZATION_BACKEND="longform-diarizer"
# In a MSDD (Multiscale Diarization Decoder) model, the diarization model is trained on multiple window lengths.
# The window_lengths are specified in seconds, and separated by a comma. If not specified, the default value will
# be "1.5, 1.25, 1.0, 0.75, 0.5".
Expand Down Expand Up @@ -97,7 +101,7 @@ ASR_TYPE="async"
#
# Include the cortex endpoint in the API. This endpoint is used to process audio files from the Cortex API.
# Use this only if you deploy the API using Cortex and Kubernetes.
CORTEX_ENDPOINT=True
CORTEX_ENDPOINT=False
#
# ---------------------------------------- API AUTHENTICATION CONFIGURATION ------------------------------------------ #
# The API authentication is used to control the access to the API endpoints.
Expand Down Expand Up @@ -130,6 +134,7 @@ SVIX_APP_ID=
#
# ----------------------------------------------- AWS CONFIGURATION ------------------------------------------------- #
#
SEND_RESULTS_TO_S3=False
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_STORAGE_BUCKET_NAME=
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,7 @@ test.ipynb
test.py
whisper_model
whisper_model_he
storage
nemo_storage
nemo_local
error.log
22 changes: 0 additions & 22 deletions .pre-commit-config.yaml

This file was deleted.

1 change: 1 addition & 0 deletions error.log
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/app/temp_outputs/mono_file.wav:[Errno 2] No such file or directory: '/app/temp_outputs/mono_file.wav'
1 change: 1 addition & 0 deletions nemo_local
Submodule nemo_local added at 5703d9
1 change: 1 addition & 0 deletions nemo_storage/infer_manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"audio_filepath": "/app/temp_outputs/mono_file.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "rttm_filepath": null, "uem_filepath": null}
2 changes: 1 addition & 1 deletion pre_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ shortuuid==1.0.13
svix==1.21.0
uvicorn==0.29.0
websockets==12.0
tensorrt_llm==0.11.0.dev2024052100
tensorrt_llm==0.9.0.dev2024032600
Cython==3.0.10
youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies
deepmultilingualpunctuation==1.0.1
Expand Down
4 changes: 3 additions & 1 deletion src/wordcab_transcribe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class Settings:
# Cortex configuration
cortex_api_key: str
# AWS configuration
send_results_to_s3: bool
aws_access_key_id: str
aws_secret_access_key: str
aws_storage_bucket_name: str
Expand Down Expand Up @@ -137,7 +138,7 @@ def align_model_compatibility_check(cls, value: str): # noqa: B902, N805
"""Check that the whisper engine is compatible."""
if value.lower() not in ["tiny", "small", "base", "medium"]:
raise ValueError(
"The whisper engine must be one of `tiny`, `small`, `base`, or"
"The align model must be one of `tiny`, `small`, `base`, or"
" `medium`."
)

Expand Down Expand Up @@ -348,6 +349,7 @@ def __post_init__(self):
# Cortex configuration
cortex_api_key=getenv("WORDCAB_TRANSCRIBE_API_KEY", ""),
# AWS configuration
send_results_to_s3=getenv("SEND_RESULTS_TO_S3", False),
aws_access_key_id=getenv("AWS_ACCESS_KEY_ID", ""),
aws_secret_access_key=getenv("AWS_SECRET_ACCESS_KEY", ""),
aws_storage_bucket_name=getenv("AWS_STORAGE_BUCKET_NAME", ""),
Expand Down
101 changes: 71 additions & 30 deletions src/wordcab_transcribe/engines/tensorrt_llm/engine_builder/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import argparse

import tensorrt_llm
import torch
from loguru import logger

import tensorrt_llm
from tensorrt_llm import str_dtype_to_torch, str_dtype_to_trt
from tensorrt_llm.builder import Builder
from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
from tensorrt_llm.logger import logger
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
from weight import load_decoder_weight, load_encoder_weight

MODEL_ENCODER_NAME = "whisper_encoder"
MODEL_DECODER_NAME = "whisper_decoder"

# Select the import location of `quantize_model` based on the TensorRT-LLM
# version declared in the TENSORRT_LLM_VERSION environment variable.
# Default to an empty string so that an unset variable reaches the explicit
# ValueError below instead of failing with "argument of type 'NoneType' is not
# iterable" on the first membership test.
TENSORRT_LLM_VERSION = os.getenv("TENSORRT_LLM_VERSION", "")
if "0.9.0" in TENSORRT_LLM_VERSION:
    from tensorrt_llm.models import quantize_model
elif "0.11.0" in TENSORRT_LLM_VERSION:
    from tensorrt_llm.quantization.quantize_by_modelopt import quantize_model
else:
    raise ValueError(
        f"Unsupported version of tensorrt_llm: {TENSORRT_LLM_VERSION!r} "
        "(set the TENSORRT_LLM_VERSION environment variable to a 0.9.0 or "
        "0.11.0 build)"
    )

def get_engine_name(model: str, dtype: str, tp_size: int = 1, rank: int = 0) -> str:
    """Return the file name of a serialized engine.

    The name has the form ``<model>_<dtype>_tp<tp_size>_rank<rank>.engine``.

    Args:
        model: Model identifier placed at the start of the file name.
        dtype: Data type label placed after the model identifier.
        tp_size: Value written after the ``tp`` marker.
        rank: Value written after the ``rank`` marker.

    Returns:
        The engine file name (no directory component).
    """
    return f"{model}_{dtype}_tp{tp_size}_rank{rank}.engine"
Expand Down Expand Up @@ -79,7 +87,7 @@ def parse_arguments():
parser.add_argument("--quantize_dir", type=str, default="quantize/1-gpu")
parser.add_argument("--dtype", type=str, default="float16", choices=["float16"])
parser.add_argument("--log_level", type=str, default="info")
parser.add_argument("--max_batch_size", type=int, default=24)
parser.add_argument("--max_batch_size", type=int, default=16)
parser.add_argument("--max_input_len", type=int, default=4)
parser.add_argument("--max_output_len", type=int, default=448)
parser.add_argument("--max_beam_width", type=int, default=1)
Expand Down Expand Up @@ -315,32 +323,63 @@ def build_decoder(model, args):
int8=args.quant_mode.has_act_or_weight_quant(),
)

tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
try:
tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
tensorrt_llm.models.modeling_utils.PretrainedConfig(
architecture="whisper",
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
vocab_size=model_metadata["n_vocab"],
max_position_embeddings=model_metadata["n_text_ctx"],
hidden_size=model_metadata["n_text_state"],
num_hidden_layers=model_metadata["n_text_layer"],
num_attention_heads=model_metadata["n_text_head"],
num_key_value_heads=model_metadata["n_text_head"],
hidden_act="gelu",
intermediate_size=4 * model_metadata["n_text_state"],
norm_epsilon=1e-5,
position_embedding_type="learned_absolute",
world_size=1,
tp_size=1,
pp_size=1,
gpus_per_node=1,
quantization=tensorrt_llm.models.modeling_utils.QuantConfig(),
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
num_buckets=0,
has_embedding_layernorm=False,
has_embedding_scale=False,
q_scaling=1.0,
has_attention_qkvo_bias=True,
has_mlp_bias=True,
has_model_final_layernorm=True,
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
)
)
except:
tensorrt_llm_whisper_decoder = tensorrt_llm.models.DecoderModel(
num_layers=model_metadata["n_text_layer"],
num_heads=model_metadata["n_text_head"],
hidden_size=model_metadata["n_text_state"],
ffn_hidden_size=4 * model_metadata["n_text_state"],
encoder_hidden_size=model_metadata["n_text_state"],
encoder_num_heads=model_metadata["n_text_head"],
vocab_size=model_metadata["n_vocab"],
head_size=model_metadata["n_text_state"] // model_metadata["n_text_head"],
max_position_embeddings=model_metadata["n_text_ctx"],
has_position_embedding=True,
relative_attention=False,
max_distance=0,
Expand All @@ -354,12 +393,11 @@ def build_decoder(model, args):
layernorm_eps=1e-5,
layernorm_position=LayerNormPositionType.pre_layernorm,
layernorm_type=LayerNormType.LayerNorm,
hidden_act="gelu",
rescale_before_lm_head=False,
encoder_head_size=model_metadata["n_text_state"]
// model_metadata["n_text_head"], # Added missing variable
skip_cross_qkv=False,
dtype=str_dtype_to_trt(args.dtype),
logits_dtype=str_dtype_to_trt(args.dtype),
)
)

if args.use_weight_only:
tensorrt_llm_whisper_decoder = quantize_model(
Expand Down Expand Up @@ -394,7 +432,10 @@ def build_decoder(model, args):
model_metadata["n_audio_ctx"],
)

tensorrt_llm_whisper_decoder(**inputs)
if "0.9.0" in TENSORRT_LLM_VERSION:
tensorrt_llm_whisper_decoder(*inputs)
else:
tensorrt_llm_whisper_decoder(**inputs)

if args.debug_mode:
for k, v in tensorrt_llm_whisper_decoder.named_network_outputs():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,31 @@
}


# Module-level defaults for the `max_output_len` and `max_beam_width`
# parameters of `build_whisper_trt_model`, read from the environment.
# An unset or empty variable falls back to the built-in default (448 and 1
# respectively); any other value is converted with int(), which raises
# ValueError for non-numeric text.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether each logger.info call belongs to the `else` branch or runs
# unconditionally — confirm against the real file.
TRT_BUILD_MAX_OUTPUT_LEN = os.getenv("TRT_BUILD_MAX_OUTPUT_LEN", None)
TRT_BUILD_MAX_BEAM_WIDTH = os.getenv("TRT_BUILD_MAX_BEAM_WIDTH", None)
# Resolve the maximum output length.
if not TRT_BUILD_MAX_OUTPUT_LEN:
TRT_BUILD_MAX_OUTPUT_LEN = 448
else:
TRT_BUILD_MAX_OUTPUT_LEN = int(TRT_BUILD_MAX_OUTPUT_LEN)
logger.info(f"TRT_BUILD_MAX_OUTPUT_LEN: {TRT_BUILD_MAX_OUTPUT_LEN}")

# Resolve the maximum beam width.
if not TRT_BUILD_MAX_BEAM_WIDTH:
TRT_BUILD_MAX_BEAM_WIDTH = 1
else:
TRT_BUILD_MAX_BEAM_WIDTH = int(TRT_BUILD_MAX_BEAM_WIDTH)
logger.info(f"TRT_BUILD_MAX_BEAM_WIDTH: {TRT_BUILD_MAX_BEAM_WIDTH}")


def build_whisper_trt_model(
output_dir,
use_gpt_attention_plugin=True,
use_gemm_plugin=True,
use_bert_attention_plugin=True,
enable_context_fmha=True,
use_weight_only=False,
model_name="distil-large-v2",
max_output_len=TRT_BUILD_MAX_OUTPUT_LEN,
max_beam_width=TRT_BUILD_MAX_BEAM_WIDTH,
model_name="large-v3",
):
"""
Build a Whisper model using the specified configuration.
Expand Down Expand Up @@ -158,6 +175,10 @@ def build_whisper_trt_model(
command.append("--enable_context_fmha")
if use_weight_only:
command.append("--use_weight_only")
if max_output_len:
command.extend(["--max_output_len", str(max_output_len)])
if max_beam_width:
command.extend(["--max_beam_width", str(max_beam_width)])

try:
subprocess.run(command, check=True)
Expand Down
8 changes: 5 additions & 3 deletions src/wordcab_transcribe/engines/tensorrt_llm/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def exact_div(x, y):
"best_of": 5,
"patience": 1,
"length_penalty": 1,
"repetition_penalty": 1.01,
"repetition_penalty": 1.05,
"no_repeat_ngram_size": 0,
"compression_ratio_threshold": 2.4,
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.3,
"no_speech_threshold": 0.4,
"prefix": None,
"suppress_blank": False,
"suppress_tokens": [-1],
Expand All @@ -61,7 +61,7 @@ def exact_div(x, y):
"best_of": 1,
"patience": 2,
"length_penalty": 1,
"repetition_penalty": 1.01,
"repetition_penalty": 1.05,
"no_repeat_ngram_size": 0,
"compression_ratio_threshold": 2.4,
"log_prob_threshold": -1.0,
Expand Down Expand Up @@ -224,7 +224,9 @@ def align_words(
start_seq_wise_req[_sot_seq] = [_idx]

token_alignments = [[] for _ in seg_metadata]

for start_seq, req_idx in start_seq_wise_req.items():

res = self.align_model.align(
ctranslate2.StorageView.from_array(features[req_idx]),
start_sequence=list(start_seq),
Expand Down
4 changes: 2 additions & 2 deletions src/wordcab_transcribe/engines/tensorrt_llm/trt_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ def get_session(self, engine_dir, runtime_mapping, debug_mode=False):

# TODO: Make dynamic max_batch_size and max_beam_width
decoder_model_config = ModelConfig(
max_batch_size=24,
max_beam_width=1,
max_batch_size=16,
max_beam_width=5,
num_heads=self.decoder_config["num_heads"],
num_kv_heads=self.decoder_config["num_heads"],
hidden_size=self.decoder_config["hidden_size"],
Expand Down
1 change: 1 addition & 0 deletions src/wordcab_transcribe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ class BaseRequest(BaseModel):
diarization: bool = False
batch_size: int = 1
source_lang: str = "en"
num_beams: int = 1
timestamps: Timestamps = Timestamps.seconds
vocab: Union[List[str], None] = None
word_timestamps: bool = False
Expand Down
Loading
Loading