diff --git a/README.md b/README.md
index 09009fe..10f860f 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,9 @@
 # LinTO-STT
 
-LinTO-STT is the transcription service within the [LinTO stack](https://github.com/linto-ai/linto-platform-stack),
-which can currently work with Speech-To-Text (STT) models.
+LinTO-STT is an API for Automatic Speech Recognition (ASR).
+
+LinTO-STT can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector.
+
 The following families of STT models are currently supported (please refer to the respective documentation for more details):
 * [Kaldi models](kaldi/README.md)
 * [Whisper models](whisper/README.md)
diff --git a/kaldi/Dockerfile b/kaldi/Dockerfile
index f062951..a28632e 100644
--- a/kaldi/Dockerfile
+++ b/kaldi/Dockerfile
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="irebai@linagora.com, rbaraglia@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 ARG KALDI_MKL
diff --git a/kaldi/README.md b/kaldi/README.md
index 0e3a31a..7ebfa85 100644
--- a/kaldi/README.md
+++ b/kaldi/README.md
@@ -1,7 +1,6 @@
 # LinTO-STT-Kaldi
 
-LinTO-STT-Kaldi is the transcription service within the [LinTO stack](https://github.com/linto-ai/linto-platform-stack)
-based on Speech-To-Text (STT) models trained with [Kaldi](https://github.com/kaldi-asr/kaldi).
+LinTO-STT-Kaldi is an API for Automatic Speech Recognition (ASR) based on models trained with [Kaldi](https://github.com/kaldi-asr/kaldi).
 
 LinTO-STT-Kaldi can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector.
diff --git a/kaldi/requirements.txt b/kaldi/requirements.txt
index 867a095..5eec3f4 100644
--- a/kaldi/requirements.txt
+++ b/kaldi/requirements.txt
@@ -2,7 +2,7 @@ celery[redis,auth,msgpack]>=4.4.7
 numpy>=1.18.5
 flask>=1.1.2
 flask-cors>=3.0.10
-flask-swagger-ui>=3.36.0
+flask-swagger-ui==3.36.0
 flask-sock
 gevent
 gunicorn
diff --git a/wait-for-it.sh b/wait-for-it.sh
index 92cbdbb..f6f20d1 100755
--- a/wait-for-it.sh
+++ b/wait-for-it.sh
@@ -67,6 +67,8 @@ wait_for_wrapper()
     return $WAITFORIT_RESULT
 }
 
+echo "NOCOMMIT wait-for-it $*"
+
 # process arguments
 while [[ $# -gt 0 ]]
 do
@@ -173,7 +175,7 @@ fi
 
 if [[ $WAITFORIT_CLI != "" ]]; then
    if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then
-        echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess"
+        echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess ${WAITFORIT_CLI[*]}"
        exit $WAITFORIT_RESULT
    fi
    exec "${WAITFORIT_CLI[@]}"
diff --git a/whisper/.envdefault b/whisper/.envdefault
index 88c27ea..75919c0 100644
--- a/whisper/.envdefault
+++ b/whisper/.envdefault
@@ -13,13 +13,18 @@ BROKER_PASS=
 # STT MODELING PARAMETERS
 ############################################
 
-# The model can be a path to a model, or a model name ("tiny", "base", "small", "medium", "large-v1", "large-v2" or "large-v3")
-MODEL=medium
+# The model can be a path to a model (e.g. "/root/.cache/whisper/large-v3.pt", "/root/.cache/huggingface/hub/models--openai--whisper-large-v3"),
+# or a model size ("tiny", "base", "small", "medium", "large-v1", "large-v2" or "large-v3"),
+# or a HuggingFace model name (e.g. "distil-whisper/distil-large-v2")
+MODEL=large-v3
 
 # The language can be in different formats: "en", "en-US", "English", ...
 # If not set or set to "*", the language will be detected automatically.
 LANGUAGE=*
 
+# Prompt to use for the model.
+# This can be used to provide context to the model, to encourage disfluencies, or a particular behaviour regarding punctuation and capitalization.
+PROMPT=
+
 # An alignment wav2vec model can be used to get word timestamps.
 # It can be a path to a model, a language code (fr, en, ...), or "wav2vec" to automatically choose a model for the language
 # This option is experimental (and not implemented with ctranslate2).
@@ -30,7 +35,9 @@ LANGUAGE=*
 ############################################
 
 # Device to use. It can be "cuda" to force/check GPU, "cpu" to force computation on CPU, or a specific GPU ("cuda:0", "cuda:1", ...)
-# DEVICE=cuda:0
+# DEVICE=cuda
+# CUDA_DEVICE_ORDER=PCI_BUS_ID
+# CUDA_VISIBLE_DEVICES=0
 
 # Number of threads per worker when running on CPU
 OMP_NUM_THREADS=4
diff --git a/whisper/Dockerfile.ctranslate2 b/whisper/Dockerfile.ctranslate2
index 52fbc44..ed19116 100644
--- a/whisper/Dockerfile.ctranslate2
+++ b/whisper/Dockerfile.ctranslate2
@@ -1,5 +1,5 @@
 FROM ghcr.io/opennmt/ctranslate2:latest-ubuntu20.04-cuda11.2
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ffmpeg git
diff --git a/whisper/Dockerfile.ctranslate2.cpu b/whisper/Dockerfile.ctranslate2.cpu
index c8d6972..df5eac7 100644
--- a/whisper/Dockerfile.ctranslate2.cpu
+++ b/whisper/Dockerfile.ctranslate2.cpu
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ffmpeg git
diff --git a/whisper/Dockerfile.torch b/whisper/Dockerfile.torch
index 2f3a0d0..06b22f3 100644
--- a/whisper/Dockerfile.torch
+++ b/whisper/Dockerfile.torch
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg
diff --git a/whisper/Dockerfile.torch.cpu b/whisper/Dockerfile.torch.cpu
index e9198d5..17a3fb8 100644
--- a/whisper/Dockerfile.torch.cpu
+++ b/whisper/Dockerfile.torch.cpu
@@ -1,5 +1,5 @@
 FROM python:3.9
-LABEL maintainer="jlouradour@linagora.com"
+LABEL maintainer="contact@linto.ai, jlouradour@linagora.com, dgaynullin@linagora.com"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg
diff --git a/whisper/README.md b/whisper/README.md
index 20a3c7d..41dc46a 100644
--- a/whisper/README.md
+++ b/whisper/README.md
@@ -1,17 +1,71 @@
 # LinTO-STT-Whisper
 
-LinTO-STT-Whisper is the transcription service within the [LinTO stack](https://github.com/linto-ai/linto-platform-stack)
-based on Speech-To-Text (STT) [Whisper models](https://openai.com/research/whisper).
+LinTO-STT-Whisper is an API for Automatic Speech Recognition (ASR) based on [Whisper models](https://openai.com/research/whisper).
 
 LinTO-STT-Whisper can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector.
 
 ## Pre-requisites
 
+### Requirements
+
+The transcription service requires [docker](https://www.docker.com/products/docker-desktop/) up and running.
+
+For GPU capabilities, you also need to install
+[nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
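+
+As a quick sanity check, you can verify that containers can access the GPU before going further
+(a minimal sketch; the CUDA image tag is only an example, any CUDA base image will do):
+
+```bash
+docker run --rm --gpus all nvidia/cuda:11.2.2-base-ubuntu20.04 nvidia-smi
+```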
+
 ### Hardware
+
 To run the transcription models you'll need:
-* At least 8Go of disk space to build the docker image.
+* At least 8GB of disk space to build the docker image,
+  and several GB of disk space to store models (a single model can take up to 5GB).
 * Up to 7GB of RAM depending on the model used.
-* One CPU per worker. Inference time scales on CPU performances.
+* One CPU per worker. Inference time scales with CPU performance.
+
+On GPU, approximate peak VRAM usage is indicated in the following table
+for some model sizes, depending on the backend
+(note that the lowest precision supported by the GPU card is automatically chosen when loading the model).
+
+<table>
+  <thead>
+    <tr>
+      <th rowspan="3">Model size</th>
+      <th colspan="4">Backend and precision</th>
+    </tr>
+    <tr>
+      <th colspan="3"><a href="whisper/Dockerfile.ctranslate2">ct2/faster_whisper</a></th>
+      <th><a href="whisper/Dockerfile.torch">torch/whisper_timestamped</a></th>
+    </tr>
+    <tr>
+      <th>int8</th>
+      <th>float16</th>
+      <th>float32</th>
+      <th>float32</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>tiny</td>
+      <td>1.5G</td>
+      <td></td>
+      <td></td>
+      <td>1.5G</td>
+    </tr>
+    <tr>
+      <td>distil-whisper/distil-large-v2</td>
+      <td>2.2G</td>
+      <td>3.2G</td>
+      <td>4.8G</td>
+      <td>4.4G</td>
+    </tr>
+    <tr>
+      <td>large (large-v3, ...)</td>
+      <td>2.8G</td>
+      <td>4.8G</td>
+      <td>8.2G</td>
+      <td>10.4G</td>
+    </tr>
+  </tbody>
+</table>
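+
+To compare the available VRAM of your card against the table above, you can query it directly
+(a sketch; this requires the NVIDIA drivers to be installed on the host):
+
+```bash
+nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+```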
 
 ### Model(s)
@@ -23,8 +77,8 @@ and can occupy several GB of disk space.
 
 LinTO-STT-Whisper also has the option to work with a wav2vec model to perform word alignment.
 The wav2vec model can be specified either
-* (TorchAudio) with a string corresponding to a `torchaudio` pipeline (e.g. "WAV2VEC2_ASR_BASE_960H") or
-* (HuggingFace's Transformers) with a string corresponding to a HuggingFace repository of a wav2vec model (e.g. "jonatasgrosman/wav2vec2-large-xlsr-53-english"), or
+* (TorchAudio) with a string corresponding to a `torchaudio` pipeline (e.g. `WAV2VEC2_ASR_BASE_960H`), or
+* (HuggingFace's Transformers) with a string corresponding to a HuggingFace repository of a wav2vec model (e.g. `jonatasgrosman/wav2vec2-large-xlsr-53-english`), or
 * (SpeechBrain) with a path corresponding to a folder with a SpeechBrain model
 
 Default wav2vec models are provided for French (fr), English (en), Spanish (es),
 German (de), Dutch (nl), Japanese (ja), Chinese (zh).
@@ -32,8 +86,6 @@ Default wav2vec models are provided for French (fr), English (en), Spanish (es),
 
 But we advise not to use a companion wav2vec alignment model: it is neither needed nor tested anymore.
 
-### Docker
-The transcription service requires docker up and running.
 
 ### (micro-service) Service broker and shared folder
 In task mode, the only entry point to the STT service is tasks posted on a message broker. Supported message brokers are RabbitMQ, Redis and Amazon SQS.
@@ -63,14 +115,16 @@ cp whisper/.envdefault whisper/.env
 
 | PARAMETER | DESCRIPTION | EXAMPLE |
 |---|---|---|
 | SERVICE_MODE | STT serving mode, see [Serving mode](#serving-mode) | `http` \| `task` |
-| MODEL | Path to a Whisper model, type of Whisper model used, or HuggingFace identifier of a Whisper model. | \<ASR_PATH\> \| `large-v3` \| `distil-whisper/distil-large-v2` \| ... |
+| MODEL | Path to a Whisper model, type of Whisper model used, or HuggingFace identifier of a Whisper model. | `large-v3` \| `distil-whisper/distil-large-v2` \| \<ASR_PATH\> \| ... |
 | LANGUAGE | (Optional) Language to recognize | `*` \| `fr` \| `fr-FR` \| `French` \| `en` \| `en-US` \| `English` \| ... |
 | PROMPT | (Optional) Prompt to use for the Whisper model | `some free text to encourage a certain transcription style (disfluencies, no punctuation, ...)` |
-| ALIGNMENT_MODEL | (Optional) Path to the wav2vec model for word alignment, or name of HuggingFace repository or torchaudio pipeline | \<WAV2VEC_PATH\> \| `WAV2VEC2_ASR_BASE_960H` \| `jonatasgrosman/wav2vec2-large-xlsr-53-english` \| ... |
-| CONCURRENCY | Maximum number of parallel requests | `3` |
+| ALIGNMENT_MODEL | (Optional and deprecated) Path to the wav2vec model for word alignment, or name of HuggingFace repository or torchaudio pipeline | `WAV2VEC2_ASR_BASE_960H` \| `jonatasgrosman/wav2vec2-large-xlsr-53-english` \| \<WAV2VEC_PATH\> \| ... |
+| DEVICE | (Optional) Device to use for the model | `cpu` \| `cuda` \| ... |
+| CUDA_VISIBLE_DEVICES | (Optional) GPU device index to use, if several. We also recommend setting `CUDA_DEVICE_ORDER=PCI_BUS_ID` on multi-GPU machines | `0` \| `1` \| `2` \| ... |
+| CONCURRENCY | Maximum number of parallel requests | `2` |
 | SERVICE_NAME | (For the task mode) queue's name for task processing | `my-stt` |
 | SERVICE_BROKER | (For the task mode) URL of the message broker | `redis://my-broker:6379` |
-| BROKER_PASS | (For the task mode only) broker password | `my-password` |
+| BROKER_PASS | (For the task mode only) broker password | `my-password` \| (empty) |
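+
+For instance, a minimal `.env` for a standalone HTTP service running on GPU could look as follows
+(a sketch only; the values are illustrative and all map to the table above):
+
+```bash
+SERVICE_MODE=http
+MODEL=large-v3
+LANGUAGE=*
+DEVICE=cuda
+CONCURRENCY=2
+```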
 
 #### MODEL environment variable
 
@@ -79,7 +133,7 @@ The model will be (downloaded if required and) loaded in memory when calling the
 
 When using a Whisper model from Hugging Face (transformers) along with ctranslate2 (faster_whisper),
 it will also download the torch library to make the conversion from torch to ctranslate2.
 
-If you want to preload the model (and later specify a path `ASR_PATH` as `MODEL`),
+If you want to preload the model (and later specify a path `<ASR_PATH>` as `MODEL`),
 you may want to download one of OpenAI Whisper models:
 * Multi-lingual Whisper models can be downloaded with the following links:
   * [tiny](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
@@ -144,26 +198,28 @@ The SERVICE_MODE value in the .env should be set to ```http```.
 
 ```bash
 docker run --rm \
 -p HOST_SERVING_PORT:80 \
--v ASR_PATH:/opt/model.pt \
 --env-file whisper/.env \
 linto-stt-whisper:latest
 ```
 
 This will run a container providing an [HTTP API](#http-api) bound to the host port HOST_SERVING_PORT.
 
-You may also want to mount your cache folder CACHE_PATH (e.g. "~/.cache") ```-v CACHE_PATH:/root/.cache```
-in order to avoid downloading models each time.
-
-Also if you want to specifiy a custom alignment model already downloaded in a folder WAV2VEC_PATH,
-you can add option ```-v WAV2VEC_PATH:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
+You may also want to add specific options:
+* To enable GPU capabilities, add ```--gpus all```.
+  Note that you can use the environment variable `DEVICE=cuda` to make sure GPU is used (and maybe set `CUDA_VISIBLE_DEVICES` if there are several available GPU cards).
+* To mount a local cache folder `<CACHE_PATH>` (e.g. "`$HOME/.cache`") and avoid downloading models each time,
+  use ```-v <CACHE_PATH>:/root/.cache```.
+  If you use the `MODEL=/opt/model.pt` environment variable, you may want to mount the model file (or folder) with option ```-v <ASR_PATH>:/opt/model.pt```.
+* If you want to specify a custom alignment model already downloaded in a folder `<WAV2VEC_PATH>`,
+  you can add option ```-v <WAV2VEC_PATH>:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
 
 **Parameters:**
 | Variables | Description | Example |
 |:-|:-|:-|
-| HOST_SERVING_PORT | Host serving port | 8080 |
-| ASR_PATH | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
-| CACHE_PATH | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
-| WAV2VEC_PATH | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
+| `HOST_SERVING_PORT` | Host serving port | 8080 |
+| `<CACHE_PATH>` | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
+| `<ASR_PATH>` | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
+| `<WAV2VEC_PATH>` | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
 
 ### Micro-service within LinTO-Platform stack
 The TASK serving mode connects a celery worker to a message broker.
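+
+For local testing, a compatible broker can be started with Docker
+(a sketch assuming Redis; adjust `SERVICE_BROKER` in `whisper/.env` to the broker address reachable from the container):
+
+```bash
+docker run -d -p 6379:6379 --name my-broker redis
+```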
@@ -174,25 +230,27 @@ You need a message broker up and running at MY_SERVICE_BROKER.
 
 ```bash
 docker run --rm \
--v ASR_PATH:/opt/model.pt \
 -v SHARED_AUDIO_FOLDER:/opt/audio \
 --env-file whisper/.env \
 linto-stt-whisper:latest
 ```
 
-You may also want to mount your cache folder CACHE_PATH (e.g. "~/.cache") ```-v CACHE_PATH:/root/.cache```
-in order to avoid downloading models each time.
-
-Also if you want to specifiy a custom alignment model already downloaded in a folder WAV2VEC_PATH,
-you can add option ```-v WAV2VEC_PATH:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
+You may also want to add specific options:
+* To enable GPU capabilities, add ```--gpus all```.
+  Note that you can use the environment variable `DEVICE=cuda` to make sure GPU is used (and maybe set `CUDA_VISIBLE_DEVICES` if there are several available GPU cards).
+* To mount a local cache folder `<CACHE_PATH>` (e.g. "`$HOME/.cache`") and avoid downloading models each time,
+  use ```-v <CACHE_PATH>:/root/.cache```.
+  If you use the `MODEL=/opt/model.pt` environment variable, you may want to mount the model file (or folder) with option ```-v <ASR_PATH>:/opt/model.pt```.
+* If you want to specify a custom alignment model already downloaded in a folder `<WAV2VEC_PATH>`,
+  you can add option ```-v <WAV2VEC_PATH>:/opt/wav2vec``` and environment variable ```ALIGNMENT_MODEL=/opt/wav2vec```.
 
 **Parameters:**
 | Variables | Description | Example |
 |:-|:-|:-|
-| SHARED_AUDIO_FOLDER | Shared audio folder mounted to /opt/audio | /my/path/to/models/vosk-model |
-| ASR_PATH | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
-| CACHE_PATH | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
-| WAV2VEC_PATH | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
+| `<SHARED_AUDIO_FOLDER>` | Shared audio folder mounted to /opt/audio | /my/path/to/audio-folder |
+| `<CACHE_PATH>` | (Optional) Path to a folder to download wav2vec alignment models when relevant | /home/username/.cache |
+| `<ASR_PATH>` | Path to the Whisper model on the host machine mounted to /opt/model.pt | /my/path/to/models/medium.pt |
+| `<WAV2VEC_PATH>` | (Optional) Path to a folder to a custom wav2vec alignment model | /my/path/to/models/wav2vec |
 
 ## Usages
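+
+The HTTP API is documented in the [HTTP API](#http-api) section. As a quick illustration, a transcription request typically looks like this
+(a sketch: the `/transcribe` endpoint, port and file name are assumptions to adapt to your deployment):
+
+```bash
+curl -X POST "http://localhost:8080/transcribe" \
+  -H "accept: application/json" \
+  -F "file=@audio.wav;type=audio/x-wav"
+```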
@@ -274,9 +332,10 @@ This project is developed under the AGPLv3 License (see LICENSE).
 
 ## Acknowledgments
 
-* [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
-* [OpenAI Whisper](https://github.com/openai/whisper)
 * [Ctranslate2](https://github.com/OpenNMT/CTranslate2)
+  * [Faster-Whisper](https://github.com/SYSTRAN/faster-whisper)
+* [OpenAI Whisper](https://github.com/openai/whisper)
+  * [Whisper-Timestamped](https://github.com/linto-ai/whisper-timestamped)
+* [HuggingFace Transformers](https://github.com/huggingface/transformers)
 * [SpeechBrain](https://github.com/speechbrain/speechbrain)
 * [TorchAudio](https://github.com/pytorch/audio)
-* [HuggingFace Transformers](https://github.com/huggingface/transformers)
\ No newline at end of file
diff --git a/whisper/RELEASE.md b/whisper/RELEASE.md
index 2967139..4d46f19 100644
--- a/whisper/RELEASE.md
+++ b/whisper/RELEASE.md
@@ -1,3 +1,7 @@
+# 1.0.1
+- Support of model.safetensors
+- ct2/faster_whisper: information about the precision used is now given in the logs
+
 # 1.0.0
 - First build of linto-stt-whisper
 - Based on 4.0.5 of linto-stt https://github.com/linto-ai/linto-stt/blob/a54b7b7ac2bc491a1795bb6dfb318a39c8b76d63/RELEASE.md
diff --git a/whisper/requirements.ctranslate2.txt b/whisper/requirements.ctranslate2.txt
index 2ddc118..530dcff 100644
--- a/whisper/requirements.ctranslate2.txt
+++ b/whisper/requirements.ctranslate2.txt
@@ -2,7 +2,7 @@ celery[redis,auth,msgpack]>=4.4.7
 flask>=1.1.2
 flask-cors>=3.0.10
 flask-sock
-flask-swagger-ui>=3.36.0
+flask-swagger-ui==3.36.0
 gevent
 gunicorn
 lockfile
diff --git a/whisper/requirements.torch.txt b/whisper/requirements.torch.txt
index 75e747c..3976414 100644
--- a/whisper/requirements.torch.txt
+++ b/whisper/requirements.torch.txt
@@ -2,7 +2,7 @@ celery[redis,auth,msgpack]>=4.4.7
 flask>=1.1.2
 flask-cors>=3.0.10
 flask-sock
-flask-swagger-ui>=3.36.0
+flask-swagger-ui==3.36.0
 gevent
 gunicorn
 lockfile
@@ -13,7 +13,6 @@ speechbrain
 transformers
 wavio>=0.0.4
 websockets
-# openai-whisper
-git+https://github.com/linto-ai/whisper-timestamped.git
+whisper-timestamped
 onnxruntime
 torchaudio
\ No newline at end of file
diff --git a/whisper/stt/processing/load_model.py b/whisper/stt/processing/load_model.py
index b87a414..c3f1e88 100644
--- a/whisper/stt/processing/load_model.py
+++ b/whisper/stt/processing/load_model.py
@@ -65,20 +65,41 @@ def load_whisper_model(model_type_or_file, device="cpu", download_root=None):
         )
         logger.info(f"CTranslate2 model in {output_dir}")
         if not os.path.isdir(output_dir):
-            import huggingface_hub
+            from transformers.utils import cached_file
+            import json
 
+            kwargs = dict(cache_dir=download_root, use_auth_token=None, revision=None)
             delete_hf_path = False
             if not os.path.isdir(model_type_or_file):
-                hf_path = huggingface_hub.hf_hub_download(
-                    repo_id=model_type_or_file, filename="pytorch_model.bin"
-                )
+                model_path = None
+                hf_path = None
+                # Look for a checkpoint file, or a sharded checkpoint index, in the HuggingFace cache
+                for candidate in ["pytorch_model.bin", "model.safetensors", "whisper.ckpt", "pytorch_model.bin.index.json", "model.safetensors.index.json"]:
+                    try:
+                        hf_path = model_path = cached_file(model_type_or_file, candidate, **kwargs)
+                    except OSError:
+                        continue
+                    if candidate.endswith("index.json"):
+                        # Sharded checkpoint: collect the shard files listed in the index
+                        index_file = model_path
+                        mapping = json.load(open(index_file))
+                        assert "weight_map" in mapping
+                        assert isinstance(mapping["weight_map"], dict)
+                        model_path = list(set(mapping["weight_map"].values()))
+                        folder = os.path.dirname(index_file)
+                        model_path = [os.path.join(folder, p) for p in model_path]
+                    break
+                if model_path is None:
+                    raise RuntimeError(f"Could not find model {model_type_or_file} on HuggingFace nor in local folders.")
                 hf_path = os.path.dirname(os.path.dirname(os.path.dirname(hf_path)))
-                delete_hf_path = not os.path.exists(hf_path)
             else:
-                assert os.path.isfile(
-                    os.path.join(model_type_or_file, "pytorch_model.bin")
-                ), f"Could not find pytorch_model.bin in {model_type_or_file}"
+                hf_path = None
+                # Look for a checkpoint file in the given local folder
+                for candidate in ["pytorch_model.bin", "model.safetensors", "whisper.ckpt", "pytorch_model.bin.index.json", "model.safetensors.index.json"]:
+                    model_path = os.path.join(model_type_or_file, candidate)
+                    if os.path.exists(model_path):
+                        hf_path = model_path
+                        break
+                if hf_path is None:
+                    raise RuntimeError(f"Could not find any model file (pytorch_model.bin, model.safetensors, ...) in {model_type_or_file}")
 
             check_torch_installed()
 
@@ -135,6 +156,7 @@
                 # num_workers=1,
                 # download_root=os.path.join(download_root, f"huggingface/hub/models--guillaumekln--faster-whisper-{model_type_or_file}"),
             )
+            logger.info(f"Whisper model loaded with compute_type={compute_type}. (t={time.time() - start}s)")
             break
         except ValueError as err:
             logger.info(