diff --git a/README.md b/README.md
index 09009fe..10f860f 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,9 @@
 # LinTO-STT
 
-LinTO-STT is the transcription service within the [LinTO stack](https://github.com/linto-ai/linto-platform-stack),
-which can currently work with Speech-To-Text (STT) models.
+LinTO-STT is an API for Automatic Speech Recognition (ASR).
+
+LinTO-STT can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector.
+
 The following families of STT models are currently supported (please refer to the respective documentation for more details):
 * [Kaldi models](kaldi/README.md)
 * [Whisper models](whisper/README.md)
diff --git a/kaldi/Dockerfile b/kaldi/Dockerfile
index f062951..a28632e 100644
--- a/kaldi/Dockerfile
+++ b/kaldi/Dockerfile
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="irebai@linagora.com, rbaraglia@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 ARG KALDI_MKL
diff --git a/kaldi/README.md b/kaldi/README.md
index 0e3a31a..7ebfa85 100644
--- a/kaldi/README.md
+++ b/kaldi/README.md
@@ -1,7 +1,6 @@
 # LinTO-STT-Kaldi
 
-LinTO-STT-Kaldi is the transcription service within the [LinTO stack](https://github.com/linto-ai/linto-platform-stack)
-based on Speech-To-Text (STT) models trained with [Kaldi](https://github.com/kaldi-asr/kaldi).
+LinTO-STT-Kaldi is an API for Automatic Speech Recognition (ASR) based on models trained with [Kaldi](https://github.com/kaldi-asr/kaldi).
 
 LinTO-STT-Kaldi can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector.
diff --git a/kaldi/requirements.txt b/kaldi/requirements.txt
index 867a095..5eec3f4 100644
--- a/kaldi/requirements.txt
+++ b/kaldi/requirements.txt
@@ -2,7 +2,7 @@ celery[redis,auth,msgpack]>=4.4.7
 numpy>=1.18.5
 flask>=1.1.2
 flask-cors>=3.0.10
-flask-swagger-ui>=3.36.0
+flask-swagger-ui==3.36.0
 flask-sock
 gevent
 gunicorn
diff --git a/wait-for-it.sh b/wait-for-it.sh
index 92cbdbb..f6f20d1 100755
--- a/wait-for-it.sh
+++ b/wait-for-it.sh
@@ -67,6 +67,8 @@ wait_for_wrapper()
     return $WAITFORIT_RESULT
 }
 
+echo "NOCOMMIT wait-for-it $*"
+
 # process arguments
 while [[ $# -gt 0 ]]
 do
@@ -173,7 +175,7 @@ fi
 
 if [[ $WAITFORIT_CLI != "" ]]; then
    if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then
-        echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess"
+        echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess ${WAITFORIT_CLI[*]}"
        exit $WAITFORIT_RESULT
    fi
    exec "${WAITFORIT_CLI[@]}"
diff --git a/whisper/.envdefault b/whisper/.envdefault
index 88c27ea..75919c0 100644
--- a/whisper/.envdefault
+++ b/whisper/.envdefault
@@ -13,13 +13,18 @@ BROKER_PASS=
 # STT MODELING PARAMETERS
 ############################################
 
-# The model can be a path to a model, or a model name ("tiny", "base", "small", "medium", "large-v1", "large-v2" or "large-v3")
-MODEL=medium
+# The model can be a path to a model (e.g. "/root/.cache/whisper/large-v3.pt", "/root/.cache/huggingface/hub/models--openai--whisper-large-v3"),
+# or a model size ("tiny", "base", "small", "medium", "large-v1", "large-v2" or "large-v3"),
+# or a HuggingFace model name (e.g. "distil-whisper/distil-large-v2")
+MODEL=large-v3
 
 # The language can be in different formats: "en", "en-US", "English", ...
 # If not set or set to "*", the language will be detected automatically.
 LANGUAGE=*
 
+# Prompt to use for the model.
+# This can be used to provide context to the model, to encourage disfluencies, or a particular behaviour regarding punctuation and capitalization.
+PROMPT=
+
 # An alignment wav2vec model can be used to get word timestamps.
 # It can be a path to a model, a language code (fr, en, ...), or "wav2vec" to automatically choose a model for the language
 # This option is experimental (and not implemented with ctranslate2).
@@ -30,7 +35,9 @@ LANGUAGE=*
 ############################################
 
 # Device to use. It can be "cuda" to force/check GPU, "cpu" to force computation on CPU, or a specific GPU ("cuda:0", "cuda:1", ...)
-# DEVICE=cuda:0
+# DEVICE=cuda
+# CUDA_DEVICE_ORDER=PCI_BUS_ID
+# CUDA_VISIBLE_DEVICES=0
 
 # Number of threads per worker when running on CPU
 OMP_NUM_THREADS=4
diff --git a/whisper/Dockerfile.ctranslate2 b/whisper/Dockerfile.ctranslate2
index 52fbc44..ed19116 100644
--- a/whisper/Dockerfile.ctranslate2
+++ b/whisper/Dockerfile.ctranslate2
@@ -1,5 +1,5 @@
 FROM ghcr.io/opennmt/ctranslate2:latest-ubuntu20.04-cuda11.2
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ffmpeg git
diff --git a/whisper/Dockerfile.ctranslate2.cpu b/whisper/Dockerfile.ctranslate2.cpu
index c8d6972..df5eac7 100644
--- a/whisper/Dockerfile.ctranslate2.cpu
+++ b/whisper/Dockerfile.ctranslate2.cpu
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ffmpeg git
diff --git a/whisper/Dockerfile.torch b/whisper/Dockerfile.torch
index 2f3a0d0..06b22f3 100644
--- a/whisper/Dockerfile.torch
+++ b/whisper/Dockerfile.torch
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg
diff --git a/whisper/Dockerfile.torch.cpu b/whisper/Dockerfile.torch.cpu
index e9198d5..17a3fb8 100644
--- a/whisper/Dockerfile.torch.cpu
+++ b/whisper/Dockerfile.torch.cpu
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg
diff --git a/whisper/README.md b/whisper/README.md
index 20a3c7d..41dc46a 100644
--- a/whisper/README.md
+++ b/whisper/README.md
@@ -1,17 +1,71 @@
 # LinTO-STT-Whisper
 
-LinTO-STT-Whisper is the transcription service within the [LinTO stack](https://github.com/linto-ai/linto-platform-stack)
-based on Speech-To-Text (STT) [Whisper models](https://openai.com/research/whisper).
+LinTO-STT-Whisper is an API for Automatic Speech Recognition (ASR) based on [Whisper models](https://openai.com/research/whisper).
 
 LinTO-STT-Whisper can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector.
 
 ## Pre-requisites
 
+### Requirements
+
+The transcription service requires [docker](https://www.docker.com/products/docker-desktop/) up and running.
+
+For GPU capabilities, you also need to install
+[nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
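+
+As a quick sanity check, you can verify that containers can access the GPU before going further
+(a minimal sketch; the CUDA image tag is only an example, any CUDA base image will do):
+
+```bash
+docker run --rm --gpus all nvidia/cuda:11.2.2-base-ubuntu20.04 nvidia-smi
+```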
+
 ### Hardware
+
 To run the transcription models you'll need:
-* At least 8Go of disk space to build the docker image.
+* At least 8GB of disk space to build the docker image,
+  and several GB of disk space to store models (a single model can take up to 5GB).
 * Up to 7GB of RAM depending on the model used.
-* One CPU per worker. Inference time scales on CPU performances.
+* One CPU per worker. Inference time scales with CPU performance.
+
+On GPU, approximate peak VRAM usage is indicated in the following table
+for some model sizes, depending on the backend
+(note that the lowest precision supported by the GPU card is automatically chosen when loading the model).
+
+<table>
+  <thead>
+    <tr>
+      <th rowspan="3">Model size</th>
+      <th colspan="4">Backend and precision</th>
+    </tr>
+    <tr>
+      <th colspan="3"><a href="whisper/Dockerfile.ctranslate2">ct2/faster_whisper</a></th>
+      <th><a href="whisper/Dockerfile.torch">torch/whisper_timestamped</a></th>
+    </tr>
+    <tr>
+      <th>int8</th>
+      <th>float16</th>
+      <th>float32</th>
+      <th>float32</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>tiny</td>
+      <td>1.5G</td>
+      <td></td>
+      <td></td>
+      <td>1.5G</td>
+    </tr>
+    <tr>
+      <td>distil-whisper/distil-large-v2</td>
+      <td>2.2G</td>
+      <td>3.2G</td>
+      <td>4.8G</td>
+      <td>4.4G</td>
+    </tr>
+    <tr>
+      <td>large (large-v3, ...)</td>
+      <td>2.8G</td>
+      <td>4.8G</td>
+      <td>8.2G</td>
+      <td>10.4G</td>
+    </tr>
+  </tbody>
+</table>
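+
+To compare the available VRAM of your card against the table above, you can query it directly
+(a sketch; this requires the NVIDIA drivers to be installed on the host):
+
+```bash
+nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+```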
 
 ### Model(s)
@@ -23,8 +77,8 @@ and can occupy several GB of disk space.
 
 LinTO-STT-Whisper also has the option to work with a wav2vec model to perform word alignment.
 The wav2vec model can be specified either
-* (TorchAudio) with a string corresponding to a `torchaudio` pipeline (e.g. "WAV2VEC2_ASR_BASE_960H") or
-* (HuggingFace's Transformers) with a string corresponding to a HuggingFace repository of a wav2vec model (e.g. "jonatasgrosman/wav2vec2-large-xlsr-53-english"), or
+* (TorchAudio) with a string corresponding to a `torchaudio` pipeline (e.g. `WAV2VEC2_ASR_BASE_960H`), or
+* (HuggingFace's Transformers) with a string corresponding to a HuggingFace repository of a wav2vec model (e.g. `jonatasgrosman/wav2vec2-large-xlsr-53-english`), or
 * (SpeechBrain) with a path corresponding to a folder with a SpeechBrain model
 
 Default wav2vec models are provided for French (fr), English (en), Spanish (es),
 German (de), Dutch (nl), Japanese (ja), Chinese (zh).
@@ -32,8 +86,6 @@ Default wav2vec models are provided for French (fr), English (en), Spanish (es),
 
 But we advise not to use a companion wav2vec alignment model: it is neither needed nor tested anymore.
 
-### Docker
-The transcription service requires docker up and running.
 
 ### (micro-service) Service broker and shared folder
 In task mode, the only entry point to the STT service is tasks posted on a message broker. Supported message brokers are RabbitMQ, Redis and Amazon SQS.
@@ -63,14 +115,16 @@ cp whisper/.envdefault whisper/.env
 
 | PARAMETER | DESCRIPTION | EXAMPLE |
 |---|---|---|
 | SERVICE_MODE | STT serving mode, see [Serving mode](#serving-mode) | `http` \| `task` |
-| MODEL | Path to a Whisper model, type of Whisper model used, or HuggingFace identifier of a Whisper model. | \<ASR_PATH\> \| `large-v3` \| `distil-whisper/distil-large-v2` \| ... |
+| MODEL | Path to a Whisper model, type of Whisper model used, or HuggingFace identifier of a Whisper model. | `large-v3` \| `distil-whisper/distil-large-v2` \| \<ASR_PATH\> \| ... |
 | LANGUAGE | (Optional) Language to recognize | `*` \| `fr` \| `fr-FR` \| `French` \| `en` \| `en-US` \| `English` \| ... |
 | PROMPT | (Optional) Prompt to use for the Whisper model | `some free text to encourage a certain transcription style (disfluencies, no punctuation, ...)` |
-| ALIGNMENT_MODEL | (Optional) Path to the wav2vec model for word alignment, or name of HuggingFace repository or torchaudio pipeline | \<WAV2VEC_PATH\> \| `WAV2VEC2_ASR_BASE_960H` \| `jonatasgrosman/wav2vec2-large-xlsr-53-english` \| ... |
-| CONCURRENCY | Maximum number of parallel requests | `3` |
+| ALIGNMENT_MODEL | (Optional and deprecated) Path to the wav2vec model for word alignment, or name of HuggingFace repository or torchaudio pipeline | `WAV2VEC2_ASR_BASE_960H` \| `jonatasgrosman/wav2vec2-large-xlsr-53-english` \| \<WAV2VEC_PATH\> \| ... |
+| DEVICE | (Optional) Device to use for the model | `cpu` \| `cuda` \| ... |
+| CUDA_VISIBLE_DEVICES | (Optional) GPU device index to use, if several. We also recommend setting `CUDA_DEVICE_ORDER=PCI_BUS_ID` on multi-GPU machines | `0` \| `1` \| `2` \| ... |
+| CONCURRENCY | Maximum number of parallel requests | `2` |
 | SERVICE_NAME | (For the task mode) queue's name for task processing | `my-stt` |
 | SERVICE_BROKER | (For the task mode) URL of the message broker | `redis://my-broker:6379` |
-| BROKER_PASS | (For the task mode only) broker password | `my-password` |
+| BROKER_PASS | (For the task mode only) broker password | `my-password` \| (empty) |
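+
+For instance, a minimal `.env` for a standalone HTTP service running on GPU could look as follows
+(a sketch only; the values are illustrative and all map to the table above):
+
+```bash
+SERVICE_MODE=http
+MODEL=large-v3
+LANGUAGE=*
+DEVICE=cuda
+CONCURRENCY=2
+```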
 
 #### MODEL environment variable
 
@@ -79,7 +133,7 @@ The model will be (downloaded if required and) loaded in memory when calling the
 
 When using a Whisper model from Hugging Face (transformers) along with ctranslate2 (faster_whisper),
 it will also download the torch library to make the conversion from torch to ctranslate2.
 
-If you want to preload the model (and later specify a path `ASR_PATH` as `MODEL`),
+If you want to preload the model (and later specify a path `<ASR_PATH>` as `MODEL`),
 you may want to download one of OpenAI Whisper models:
 * Multi-lingual Whisper models can be downloaded with the following links:
   * [tiny](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
@@ -144,26 +198,28 @@ The SERVICE_MODE value in the .env should be set to ```http```.
 
 ```bash
 docker run --rm \
 -p HOST_SERVING_PORT:80 \
--v ASR_PATH:/opt/model.pt \
 --env-file whisper/.env \
 linto-stt-whisper:latest
 ```
 
 This will run a container providing an [HTTP API](#http-api) bound to the host port HOST_SERVING_PORT.
 
-You may also want to mount your cache folder CACHE_PATH (e.g. "~/.cache") ```-v CACHE_PATH:/root/.cache```
-in order to avoid downloading models each time.
-
-Also if you want to specifiy a custom alignment model already downloaded in a folder WAV2VEC_PATH,
-you can add option ```-v WAV2VEC_PATH:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
+You may also want to add specific options:
+* To enable GPU capabilities, add ```--gpus all```.
+  Note that you can use the environment variable `DEVICE=cuda` to make sure GPU is used (and maybe set `CUDA_VISIBLE_DEVICES` if there are several available GPU cards).
+* To mount a local cache folder `<CACHE_PATH>` (e.g. "`$HOME/.cache`") and avoid downloading models each time,
+  use ```-v <CACHE_PATH>:/root/.cache```.
+  If you use the `MODEL=/opt/model.pt` environment variable, you may want to mount the model file (or folder) with option ```-v <ASR_PATH>:/opt/model.pt```.
+* If you want to specify a custom alignment model already downloaded in a folder `<WAV2VEC_PATH>`,
+  you can add option ```-v <WAV2VEC_PATH>:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
 
 **Parameters:**
 | Variables | Description | Example |
 |:-|:-|:-|
-| HOST_SERVING_PORT | Host serving port | 8080 |
-| ASR_PATH | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
-| CACHE_PATH | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
-| WAV2VEC_PATH | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
+| `HOST_SERVING_PORT` | Host serving port | 8080 |
+| `<CACHE_PATH>` | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
+| `<ASR_PATH>` | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
+| `<WAV2VEC_PATH>` | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
 
 ### Micro-service within LinTO-Platform stack
 The TASK serving mode connects a celery worker to a message broker.
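+
+For local testing, a compatible broker can be started with Docker
+(a sketch assuming Redis; adjust `SERVICE_BROKER` in `whisper/.env` to the broker address reachable from the container):
+
+```bash
+docker run -d -p 6379:6379 --name my-broker redis
+```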
@@ -174,25 +230,27 @@ You need a message broker up and running at MY_SERVICE_BROKER.
 
 ```bash
 docker run --rm \
--v ASR_PATH:/opt/model.pt \
 -v SHARED_AUDIO_FOLDER:/opt/audio \
 --env-file whisper/.env \
 linto-stt-whisper:latest
 ```
 
-You may also want to mount your cache folder CACHE_PATH (e.g. "~/.cache") ```-v CACHE_PATH:/root/.cache```
-in order to avoid downloading models each time.
-
-Also if you want to specifiy a custom alignment model already downloaded in a folder WAV2VEC_PATH,
-you can add option ```-v WAV2VEC_PATH:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
+You may also want to add specific options:
+* To enable GPU capabilities, add ```--gpus all```.
+  Note that you can use the environment variable `DEVICE=cuda` to make sure GPU is used (and maybe set `CUDA_VISIBLE_DEVICES` if there are several available GPU cards).
+* To mount a local cache folder `<CACHE_PATH>` (e.g. "`$HOME/.cache`") and avoid downloading models each time,
+  use ```-v <CACHE_PATH>:/root/.cache```.
+  If you use the `MODEL=/opt/model.pt` environment variable, you may want to mount the model file (or folder) with option ```-v <ASR_PATH>:/opt/model.pt```.
+* If you want to specify a custom alignment model already downloaded in a folder `<WAV2VEC_PATH>`,
+  you can add option ```-v <WAV2VEC_PATH>:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
 
 **Parameters:**
 | Variables | Description | Example |
 |:-|:-|:-|
-| SHARED_AUDIO_FOLDER | Shared audio folder mounted to /opt/audio | /my/path/to/models/vosk-model |
-| ASR_PATH | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
-| CACHE_PATH | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
-| WAV2VEC_PATH | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
+| `<SHARED_AUDIO_FOLDER>` | Shared audio folder mounted to /opt/audio | /my/path/to/audio-folder |
+| `<CACHE_PATH>` | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
+| `<ASR_PATH>` | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
+| `<WAV2VEC_PATH>` | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
 
 ## Usages
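+
+The HTTP API is documented in the [HTTP API](#http-api) section. As a quick illustration, a transcription request typically looks like this
+(a sketch: the `/transcribe` endpoint, port and file name are assumptions to adapt to your deployment):
+
+```bash
+curl -X POST "http://localhost:8080/transcribe" \
+  -H "accept: application/json" \
+  -F "file=@audio.wav;type=audio/x-wav"
+```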
@@ -274,9 +332,10 @@ This project is developed under the AGPLv3 License (see LICENSE).
 
 ## Acknowledgments
 
-* [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
-* [OpenAI Whisper](https://github.com/openai/whisper)
 * [Ctranslate2](https://github.com/OpenNMT/CTranslate2)
+  * [Faster-Whisper](https://github.com/SYSTRAN/faster-whisper)
+* [OpenAI Whisper](https://github.com/openai/whisper)
+  * [Whisper-Timestamped](https://github.com/linto-ai/whisper-timestamped)
+* [HuggingFace Transformers](https://github.com/huggingface/transformers)
 * [SpeechBrain](https://github.com/speechbrain/speechbrain)
 * [TorchAudio](https://github.com/pytorch/audio)
-* [HuggingFace Transformers](https://github.com/huggingface/transformers)
\ No newline at end of file
diff --git a/whisper/RELEASE.md b/whisper/RELEASE.md
index 2967139..4d46f19 100644
--- a/whisper/RELEASE.md
+++ b/whisper/RELEASE.md
@@ -1,3 +1,7 @@
+# 1.0.1
+- Support of model.safetensors
+- ct2/faster_whisper: information about the precision used is now given in the logs
+
 # 1.0.0
 - First build of linto-stt-whisper
 - Based on 4.0.5 of linto-stt https://github.com/linto-ai/linto-stt/blob/a54b7b7ac2bc491a1795bb6dfb318a39c8b76d63/RELEASE.md
diff --git a/whisper/requirements.ctranslate2.txt b/whisper/requirements.ctranslate2.txt
index 2ddc118..530dcff 100644
--- a/whisper/requirements.ctranslate2.txt
+++ b/whisper/requirements.ctranslate2.txt
@@ -2,7 +2,7 @@ celery[redis,auth,msgpack]>=4.4.7
 flask>=1.1.2
 flask-cors>=3.0.10
 flask-sock
-flask-swagger-ui>=3.36.0
+flask-swagger-ui==3.36.0
 gevent
 gunicorn
 lockfile
diff --git a/whisper/requirements.torch.txt b/whisper/requirements.torch.txt
index 75e747c..3976414 100644
--- a/whisper/requirements.torch.txt
+++ b/whisper/requirements.torch.txt
@@ -2,7 +2,7 @@ celery[redis,auth,msgpack]>=4.4.7
 flask>=1.1.2
 flask-cors>=3.0.10
 flask-sock
-flask-swagger-ui>=3.36.0
+flask-swagger-ui==3.36.0
 gevent
 gunicorn
 lockfile
@@ -13,7 +13,6 @@ speechbrain
 transformers
 wavio>=0.0.4
 websockets
-# openai-whisper
-git+https://github.com/linto-ai/whisper-timestamped.git
+whisper-timestamped
 onnxruntime
 torchaudio
\ No newline at end of file
diff --git a/whisper/stt/processing/load_model.py b/whisper/stt/processing/load_model.py
index b87a414..c3f1e88 100644
--- a/whisper/stt/processing/load_model.py
+++ b/whisper/stt/processing/load_model.py
@@ -65,20 +65,41 @@ def load_whisper_model(model_type_or_file, device="cpu", download_root=None):
         )
         logger.info(f"CTranslate2 model in {output_dir}")
         if not os.path.isdir(output_dir):
-            import huggingface_hub
+            from transformers.utils import cached_file
+            import json
 
+            kwargs = dict(cache_dir=download_root, use_auth_token=None, revision=None)
             delete_hf_path = False
             if not os.path.isdir(model_type_or_file):
-                hf_path = huggingface_hub.hf_hub_download(
-                    repo_id=model_type_or_file, filename="pytorch_model.bin"
-                )
+                model_path = None
+                hf_path = None
+                # Look for a checkpoint file, or a sharded checkpoint index, in the HuggingFace cache
+                for candidate in ["pytorch_model.bin", "model.safetensors", "whisper.ckpt", "pytorch_model.bin.index.json", "model.safetensors.index.json"]:
+                    try:
+                        hf_path = model_path = cached_file(model_type_or_file, candidate, **kwargs)
+                    except OSError:
+                        continue
+                    if candidate.endswith("index.json"):
+                        # Sharded checkpoint: collect the shard files listed in the index
+                        index_file = model_path
+                        mapping = json.load(open(index_file))
+                        assert "weight_map" in mapping
+                        assert isinstance(mapping["weight_map"], dict)
+                        model_path = list(set(mapping["weight_map"].values()))
+                        folder = os.path.dirname(index_file)
+                        model_path = [os.path.join(folder, p) for p in model_path]
+                    break
+                if model_path is None:
+                    raise RuntimeError(f"Could not find model {model_type_or_file} on HuggingFace nor in local folders.")
                 hf_path = os.path.dirname(os.path.dirname(os.path.dirname(hf_path)))
-                delete_hf_path = not os.path.exists(hf_path)
             else:
-                assert os.path.isfile(
-                    os.path.join(model_type_or_file, "pytorch_model.bin")
-                ), f"Could not find pytorch_model.bin in {model_type_or_file}"
+                hf_path = None
+                # Look for a checkpoint file in the given local folder
+                for candidate in ["pytorch_model.bin", "model.safetensors", "whisper.ckpt", "pytorch_model.bin.index.json", "model.safetensors.index.json"]:
+                    model_path = os.path.join(model_type_or_file, candidate)
+                    if os.path.exists(model_path):
+                        hf_path = model_path
+                        break
+                if hf_path is None:
+                    raise RuntimeError(f"Could not find any model file (pytorch_model.bin, model.safetensors, ...) in {model_type_or_file}")
 
             check_torch_installed()
 
@@ -135,6 +156,7 @@
                 # num_workers=1,
                 # download_root=os.path.join(download_root, f"huggingface/hub/models--guillaumekln--faster-whisper-{model_type_or_file}"),
             )
+            logger.info(f"Whisper model loaded with compute_type={compute_type}. (t={time.time() - start}s)")
             break
         except ValueError as err:
             logger.info(