From 71d207a607e0f5ac55f53457de150e6fd642f910 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=8E=E6=99=A8?= Date: Sat, 30 Dec 2023 23:19:30 +0800 Subject: [PATCH 1/3] feat: add large-v3 model --- Audio-Transcription-Chrome/popup.html | 1 + Audio-Transcription-Firefox/popup.html | 1 + whisper_live/server.py | 4 ++-- whisper_live/transcriber.py | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Audio-Transcription-Chrome/popup.html b/Audio-Transcription-Chrome/popup.html index 79f18ca9..3fae5484 100644 --- a/Audio-Transcription-Chrome/popup.html +++ b/Audio-Transcription-Chrome/popup.html @@ -140,6 +140,7 @@ + diff --git a/Audio-Transcription-Firefox/popup.html b/Audio-Transcription-Firefox/popup.html index 2755a762..30184a59 100644 --- a/Audio-Transcription-Firefox/popup.html +++ b/Audio-Transcription-Firefox/popup.html @@ -142,6 +142,7 @@ + diff --git a/whisper_live/server.py b/whisper_live/server.py index 1fbd531a..5106be63 100644 --- a/whisper_live/server.py +++ b/whisper_live/server.py @@ -211,7 +211,7 @@ def __init__( self.data = b"" self.frames = b"" self.model_sizes = [ - "tiny", "base", "small", "medium", "large-v2" + "tiny", "base", "small", "medium", "large-v2", "large-v3" ] self.multilingual = multilingual self.model_size = self.get_model_size(model_size) @@ -277,7 +277,7 @@ def get_model_size(self, model_size): ) return None - if model_size == "large-v2": + if model_size in ["large-v2", "large-v3"]: self.multilingual = True return model_size diff --git a/whisper_live/transcriber.py b/whisper_live/transcriber.py index a275878b..918dc2ae 100644 --- a/whisper_live/transcriber.py +++ b/whisper_live/transcriber.py @@ -94,7 +94,7 @@ def __init__( Args: model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en, - small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted + small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a converted model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub. When a size or a model ID is configured, the converted model is downloaded from the Hugging Face Hub. @@ -914,7 +914,7 @@ def find_alignment( words, word_tokens, start_times, end_times, word_probabilities ) ] - + def destroy(self): del self.model From 5918b5ed4247a7c82050c6aa5a2ae88f1463278f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=8E=E6=99=A8?= Date: Sun, 31 Dec 2023 15:45:41 +0800 Subject: [PATCH 2/3] fix: update faster-whisper --- requirements/server.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/server.txt b/requirements/server.txt index 0b19eaa9..62a292e7 100644 --- a/requirements/server.txt +++ b/requirements/server.txt @@ -1,5 +1,5 @@ PyAudio -faster-whisper==0.9.0 +faster-whisper==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu111 torch==1.10.1 torchaudio==0.10.1 diff --git a/setup.py b/setup.py index 05aad9d8..ecc040cc 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ ), install_requires=[ "PyAudio", - "faster-whisper==0.9.0", + "faster-whisper==0.10.0", "torch", "torchaudio", "websockets", From 02793a93f80d73baeff95897bcfc533b47545bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=8E=E6=99=A8?= Date: Mon, 1 Jan 2024 20:22:37 +0800 Subject: [PATCH 3/3] feat: Update transcriber to support large-v3 model with 128 mel filters --- whisper_live/transcriber.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/whisper_live/transcriber.py b/whisper_live/transcriber.py index 918dc2ae..cf301d3d 100644 --- a/whisper_live/transcriber.py +++ b/whisper_live/transcriber.py @@ -4,6 +4,8 @@ import logging import os import zlib +import json +from inspect import signature from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union @@ -144,7 +146,8 @@ def __init__( "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en") ) - self.feature_extractor = FeatureExtractor() + self.feat_kwargs = self._get_feature_kwargs(model_path) + self.feature_extractor = FeatureExtractor(**self.feat_kwargs) self.num_samples_per_token = self.feature_extractor.hop_length * 2 self.frames_per_second = ( self.feature_extractor.sampling_rate // self.feature_extractor.hop_length @@ -161,6 +164,22 @@ def supported_languages(self) -> List[str]: """The languages supported by the model.""" return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"] + def _get_feature_kwargs(self, model_path) -> dict: + preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json") + config = {} + if os.path.isfile(preprocessor_config_file): + try: + with open(preprocessor_config_file, "r", encoding="utf-8") as json_file: + config = json.load(json_file) + valid_keys = signature(FeatureExtractor.__init__).parameters.keys() + config = {k: v for k, v in config.items() if k in valid_keys} + except json.JSONDecodeError as e: + self.logger.warning( + "Could not load preprocessor_config.json: %s", str(e) + ) + + return config + def transcribe( self, audio: Union[str, BinaryIO, np.ndarray],