From 076aebf3b6e48aff4cd1a79e730682e696a9497c Mon Sep 17 00:00:00 2001
From: makaveli10
Date: Fri, 12 Jan 2024 15:21:27 +0530
Subject: [PATCH 1/4] bump version v0.0.11

---
 whisper_live/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whisper_live/__version__.py b/whisper_live/__version__.py
index 92390534..54654ce2 100644
--- a/whisper_live/__version__.py
+++ b/whisper_live/__version__.py
@@ -1 +1 @@
-__version__="0.0.10"
\ No newline at end of file
+__version__="0.0.11"

From 71d0fe69c66a79c793dd4e1facb8ff7d7f2ef62f Mon Sep 17 00:00:00 2001
From: makaveli10
Date: Mon, 15 Jan 2024 23:28:02 +0800
Subject: [PATCH 2/4] add option to use custom model

---
 run_server.py          | 10 +++++++++-
 whisper_live/client.py | 24 +++++++++++++++++++++---
 whisper_live/server.py | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/run_server.py b/run_server.py
index cac0f934..0c2d5de3 100644
--- a/run_server.py
+++ b/run_server.py
@@ -1,5 +1,13 @@
+import argparse
 from whisper_live.server import TranscriptionServer

 if __name__ == "__main__":
     server = TranscriptionServer()
-    server.run("0.0.0.0")
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', type=str, default=None, help="Custom Faster Whisper Model")
+    args = parser.parse_args()
+    server.run(
+        "0.0.0.0",
+        9090,
+        custom_model_path=args.model_path
+    )
diff --git a/whisper_live/client.py b/whisper_live/client.py
index 018e0904..979e3254 100644
--- a/whisper_live/client.py
+++ b/whisper_live/client.py
@@ -50,7 +50,14 @@ class Client:
     INSTANCES = {}

     def __init__(
-        self, host=None, port=None, is_multilingual=False, lang=None, translate=False, model_size="small"
+        self,
+        host=None,
+        port=None,
+        is_multilingual=False,
+        lang=None,
+        translate=False,
+        model_size="small",
+        use_custom_model=False
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.
@@ -83,6 +90,8 @@ def __init__(
         self.language = lang
         self.model_size = model_size
         self.server_error = False
+        self.use_custom_model = use_custom_model
+
         if translate:
             self.task = "translate"

@@ -221,6 +230,7 @@ def on_open(self, ws):
                     "language": self.language,
                     "task": self.task,
                     "model_size": self.model_size,
+                    "use_custom_model": self.use_custom_model # if running your own server with a custom model
                 }
             )
         )
@@ -505,8 +515,16 @@ class TranscriptionClient:
         transcription_client()
         ```
     """
-    def __init__(self, host, port, is_multilingual=False, lang=None, translate=False, model_size="small"):
-        self.client = Client(host, port, is_multilingual, lang, translate, model_size)
+    def __init__(self,
+        host,
+        port,
+        is_multilingual=False,
+        lang=None,
+        translate=False,
+        model_size="small",
+        use_custom_model=False
+    ):
+        self.client = Client(host, port, is_multilingual, lang, translate, model_size, use_custom_model)

     def __call__(self, audio=None, hls_url=None):
         """
diff --git a/whisper_live/server.py b/whisper_live/server.py
index df132c49..ddeaee3e 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -1,3 +1,4 @@
+import os
 import websockets
 import time
 import threading
@@ -12,6 +13,8 @@
 import torch
 import numpy as np
 import time
+import functools
+
 from whisper_live.transcriber import WhisperModel


@@ -58,7 +61,7 @@ def get_wait_time(self):

         return wait_time / 60

-    def recv_audio(self, websocket):
+    def recv_audio(self, websocket, custom_model_path=None):
         """
         Receive audio chunks from a client in an infinite loop.

@@ -95,6 +98,11 @@ def recv_audio(self, websocket):
                 websocket.close()
                 del websocket
                 return
+
+        # validate custom model
+        if options["use_custom_model"]:
+            if custom_model_path is None or not os.path.exists(custom_model_path):
+                options["use_custom_model"] = False

         client = ServeClient(
             websocket,
@@ -102,9 +110,10 @@ def recv_audio(self, websocket):
             language=options["language"],
             task=options["task"],
             client_uid=options["uid"],
-            model_size=options["model_size"],
+            model_size_or_path=custom_model_path if options["use_custom_model"] else options["model_size"],
             initial_prompt=options.get("initial_prompt"),
-            vad_parameters=options.get("vad_parameters")
+            vad_parameters=options.get("vad_parameters"),
+            use_custom_model=options["use_custom_model"]
         )

         self.clients[websocket] = client
@@ -137,7 +146,7 @@ def recv_audio(self, websocket):
                 del websocket
                 break

-    def run(self, host, port=9090):
+    def run(self, host, port=9090, custom_model_path=None):
         """
         Run the transcription server.

@@ -145,7 +154,14 @@
             host (str): The host address to bind the server.
             port (int): The port number to bind the server.
         """
-        with serve(self.recv_audio, host, port) as server:
+        with serve(
+            functools.partial(
+                self.recv_audio,
+                custom_model_path=custom_model_path
+            ),
+            host,
+            port
+        ) as server:
             server.serve_forever()


@@ -190,9 +206,10 @@ def __init__(
         multilingual=False,
         language=None,
         client_uid=None,
-        model_size="small",
+        model_size_or_path="small",
         initial_prompt=None,
-        vad_parameters=None
+        vad_parameters=None,
+        use_custom_model=False
     ):
         """
         Initialize a ServeClient instance.
@@ -216,7 +233,11 @@ def __init__(
             "tiny", "base", "small", "medium", "large-v2", "large-v3"
         ]
         self.multilingual = multilingual
-        self.model_size = self.get_model_size(model_size)
+        if not use_custom_model:
+            self.model_size_or_path = self.get_model_size(model_size_or_path)
+        else:
+            self.model_size_or_path = model_size_or_path
+
         self.language = language if self.multilingual else "en"
         self.task = task
         self.websocket = websocket
@@ -225,11 +246,11 @@ def __init__(

         device = "cuda" if torch.cuda.is_available() else "cpu"

-        if self.model_size == None:
+        if self.model_size_or_path == None:
             return

         self.transcriber = WhisperModel(
-            self.model_size,
+            self.model_size_or_path,
             device=device,
             compute_type="int8" if device=="cpu" else "float16",
             local_files_only=False,

From c8103693241462d01d132e80046c33554efbcac2 Mon Sep 17 00:00:00 2001
From: makaveli10
Date: Mon, 15 Jan 2024 23:30:05 +0800
Subject: [PATCH 3/4] revert the default port of chrome/firefox extensions to 9090

---
 Audio-Transcription-Chrome/popup.js  | 2 +-
 Audio-Transcription-Firefox/popup.js | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Audio-Transcription-Chrome/popup.js b/Audio-Transcription-Chrome/popup.js
index a1dd210c..9a708d12 100644
--- a/Audio-Transcription-Chrome/popup.js
+++ b/Audio-Transcription-Chrome/popup.js
@@ -73,7 +73,7 @@ document.addEventListener("DOMContentLoaded", function () {

       // Send a message to the background script to start capturing
       let host = "localhost";
-      let port = "5901";
+      let port = "9090";
       const useCollaboraServer = useServerCheckbox.checked;
       if (useCollaboraServer){
         host = "transcription.kurg.org"
diff --git a/Audio-Transcription-Firefox/popup.js b/Audio-Transcription-Firefox/popup.js
index b6785dd9..17f89503 100644
--- a/Audio-Transcription-Firefox/popup.js
+++ b/Audio-Transcription-Firefox/popup.js
@@ -66,7 +66,7 @@ document.addEventListener("DOMContentLoaded", function() {
   startButton.addEventListener("click", function() {
     let host = "localhost";
-    let port = "5901";
+    let port = "9090";
     const useCollaboraServer = useServerCheckbox.checked;

     if (useCollaboraServer){

From 881fd55776c0fb28cb7c1f7954fee0e223107ab9 Mon Sep 17 00:00:00 2001
From: makaveli10
Date: Thu, 18 Jan 2024 15:02:51 +0800
Subject: [PATCH 4/4] run server with custom model from args

---
 whisper_live/client.py | 14 +++++---------
 whisper_live/server.py | 37 +++++++++++++++++++------------------
 2 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/whisper_live/client.py b/whisper_live/client.py
index 979e3254..8b08d159 100644
--- a/whisper_live/client.py
+++ b/whisper_live/client.py
@@ -56,8 +56,7 @@ def __init__(
         is_multilingual=False,
         lang=None,
         translate=False,
-        model_size="small",
-        use_custom_model=False
+        model="small",
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.
@@ -88,9 +87,8 @@ def __init__(
         self.disconnect_if_no_response_for = 15
         self.multilingual = is_multilingual
         self.language = lang
-        self.model_size = model_size
+        self.model = model
         self.server_error = False
-        self.use_custom_model = use_custom_model

         if translate:
             self.task = "translate"
@@ -229,8 +227,7 @@ def on_open(self, ws):
                     "multilingual": self.multilingual,
                     "language": self.language,
                     "task": self.task,
-                    "model_size": self.model_size,
-                    "use_custom_model": self.use_custom_model # if running your own server with a custom model
+                    "model": self.model,
                 }
             )
         )
@@ -521,10 +518,9 @@ def __init__(self,
         is_multilingual=False,
         lang=None,
         translate=False,
-        model_size="small",
-        use_custom_model=False
+        model="small",
     ):
-        self.client = Client(host, port, is_multilingual, lang, translate, model_size, use_custom_model)
+        self.client = Client(host, port, is_multilingual, lang, translate, model)

     def __call__(self, audio=None, hls_url=None):
         """
diff --git a/whisper_live/server.py b/whisper_live/server.py
index ddeaee3e..56e8a64d 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -6,7 +6,7 @@
 import textwrap

 import logging
-# logging.basicConfig(level = logging.INFO)
+logging.basicConfig(level = logging.INFO)

 from websockets.sync.server import serve

@@ -100,9 +100,9 @@ def recv_audio(self, websocket, custom_model_path=None):
                 return

         # validate custom model
-        if options["use_custom_model"]:
-            if custom_model_path is None or not os.path.exists(custom_model_path):
-                options["use_custom_model"] = False
+        if custom_model_path is not None and os.path.exists(custom_model_path):
+            logging.info(f"Using custom model {custom_model_path}")
+            options["model"] = custom_model_path

         client = ServeClient(
             websocket,
@@ -110,10 +110,9 @@ def recv_audio(self, websocket, custom_model_path=None):
             language=options["language"],
             task=options["task"],
             client_uid=options["uid"],
-            model_size_or_path=custom_model_path if options["use_custom_model"] else options["model_size"],
+            model=options["model"],
             initial_prompt=options.get("initial_prompt"),
             vad_parameters=options.get("vad_parameters"),
-            use_custom_model=options["use_custom_model"]
         )

         self.clients[websocket] = client
@@ -206,10 +205,9 @@ def __init__(
         multilingual=False,
         language=None,
         client_uid=None,
-        model_size_or_path="small",
+        model="small",
         initial_prompt=None,
         vad_parameters=None,
-        use_custom_model=False
     ):
         """
         Initialize a ServeClient instance.
@@ -230,13 +228,15 @@ def __init__(
         self.data = b""
         self.frames = b""
         self.model_sizes = [
-            "tiny", "base", "small", "medium", "large-v2", "large-v3"
+            "tiny", "tiny.en", "base", "base.en", "small", "small.en",
+            "medium", "medium.en", "large-v2", "large-v3",
         ]
+
         self.multilingual = multilingual
-        if not use_custom_model:
-            self.model_size_or_path = self.get_model_size(model_size_or_path)
+        if not os.path.exists(model):
+            self.model_size_or_path = self.get_model_size(model)
         else:
-            self.model_size_or_path = model_size_or_path
+            self.model_size_or_path = model

         self.language = language if self.multilingual else "en"
         self.task = task
@@ -246,7 +246,7 @@ def __init__(

         device = "cuda" if torch.cuda.is_available() else "cpu"

-        if self.model_size_or_path == None:
+        if self.model_size_or_path is None:
             return

         self.transcriber = WhisperModel(
@@ -302,12 +302,13 @@ def get_model_size(self, model_size):
             )
             return None

-        if model_size in ["large-v2", "large-v3"]:
+        if model_size.endswith("en") and self.multilingual:
+            logging.info(f"Setting multilingual to false for {model_size}, which is an English-only model.")
+            self.multilingual = False
+
+        if not model_size.endswith("en") and not self.multilingual:
+            logging.info(f"Setting multilingual to true for multilingual model {model_size}.")
             self.multilingual = True
-            return model_size
-
-        if not self.multilingual:
-            model_size = model_size + ".en"

         return model_size
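
Usage note: with this series applied, the server is launched with a custom
faster-whisper model via `python run_server.py --model_path <dir>`. Below is a
minimal sketch of the equivalent Python entry point; the model directory is an
assumed example and must hold a CTranslate2-converted Whisper model, since the
server hands the validated path straight to faster_whisper.WhisperModel:

    from whisper_live.server import TranscriptionServer

    server = TranscriptionServer()
    server.run(
        "0.0.0.0",
        9090,  # default port, matching the reverted browser extensions
        custom_model_path="/models/whisper-small-ct2",  # assumed example path
    )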
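
On the client side, PATCH 4/4 collapses `model_size`/`use_custom_model` into a
single `model` option, and the server overrides it whenever it was started
with a valid --model_path. A minimal sketch (host, port, and the audio file
are illustrative):

    from whisper_live.client import TranscriptionClient

    client = TranscriptionClient(
        "localhost",
        9090,
        is_multilingual=True,
        lang="en",
        translate=False,
        model="small",  # ignored by a server running a custom model
    )
    client("tests/jfk.flac")  # assumed example audio file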
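
The reworked get_model_size no longer appends ".en" itself; it validates the
requested size against the expanded list and reconciles the multilingual flag
with it. A standalone sketch of that behavior, for illustration only (not the
module code itself):

    VALID_SIZES = [
        "tiny", "tiny.en", "base", "base.en", "small", "small.en",
        "medium", "medium.en", "large-v2", "large-v3",
    ]

    def resolve(model_size, multilingual):
        """Mirror ServeClient.get_model_size: return (size, multilingual)."""
        if model_size not in VALID_SIZES:
            return None, multilingual      # unknown sizes are rejected
        if model_size.endswith("en") and multilingual:
            multilingual = False           # ".en" checkpoints are English-only
        if not model_size.endswith("en") and not multilingual:
            multilingual = True            # plain checkpoints are multilingual
        return model_size, multilingual

    assert resolve("small.en", True) == ("small.en", False)
    assert resolve("large-v3", False) == ("large-v3", True)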