diff --git a/Audio-Transcription-Chrome/content.js b/Audio-Transcription-Chrome/content.js
index 1c89a2a8..3c911e48 100644
--- a/Audio-Transcription-Chrome/content.js
+++ b/Audio-Transcription-Chrome/content.js
@@ -59,7 +59,7 @@ function init_element() {
elem_container = document.createElement('div');
elem_container.id = "transcription";
- elem_container.style.cssText = 'padding-top:16px;font-size:18px;line-height:18px;top:0px;position:absolute;width:500px;height:90px;opacity:0.9;z-index:100;background:black;border-radius:10px;color:white;';
+ elem_container.style.cssText = 'padding-top:16px;font-size:18px;position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%);z-index: 9999;line-height:18px;width:500px;height:90px;opacity:0.9;background:black;border-radius:10px;color:white;'; // keep a single z-index: in cssText the last duplicate declaration wins
for (var i = 0; i < 4; i++) {
elem_text = document.createElement('span');
diff --git a/Audio-Transcription-Chrome/options.js b/Audio-Transcription-Chrome/options.js
index 7e8ae5b2..435d75b2 100644
--- a/Audio-Transcription-Chrome/options.js
+++ b/Audio-Transcription-Chrome/options.js
@@ -93,14 +93,10 @@ async function startRecord(option) {
const socket = new WebSocket(`ws://${option.host}:${option.port}/`);
let isServerReady = false;
let language = option.language;
- if (language === null && !option.multilingual) {
- language = 'en';
- }
socket.onopen = function(e) {
socket.send(
JSON.stringify({
uid: uuid,
- multilingual: option.multilingual,
language: option.language,
task: option.task,
model: option.modelSize
diff --git a/Audio-Transcription-Chrome/popup.html b/Audio-Transcription-Chrome/popup.html
index 3fae5484..8d45bcb2 100644
--- a/Audio-Transcription-Chrome/popup.html
+++ b/Audio-Transcription-Chrome/popup.html
@@ -15,112 +15,109 @@
-
@@ -134,11 +131,15 @@
diff --git a/Audio-Transcription-Chrome/popup.js b/Audio-Transcription-Chrome/popup.js
index 9a708d12..3c40aa8b 100644
--- a/Audio-Transcription-Chrome/popup.js
+++ b/Audio-Transcription-Chrome/popup.js
@@ -4,7 +4,6 @@ document.addEventListener("DOMContentLoaded", function () {
const stopButton = document.getElementById("stopCapture");
const useServerCheckbox = document.getElementById("useServerCheckbox");
- const useMultilingualCheckbox = document.getElementById('useMultilingualCheckbox');
const languageDropdown = document.getElementById('languageDropdown');
const taskDropdown = document.getElementById('taskDropdown');
const modelSizeDropdown = document.getElementById('modelSizeDropdown');
@@ -32,14 +31,6 @@ document.addEventListener("DOMContentLoaded", function () {
}
});
- chrome.storage.local.get("useMultilingualModelState", ({ useMultilingualModelState }) => {
- if (useMultilingualModelState !== undefined) {
- useMultilingualCheckbox.checked = useMultilingualModelState;
- languageDropdown.disabled = !useMultilingualModelState;
- taskDropdown.disabled = !useMultilingualModelState;
- }
- });
-
chrome.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => {
if (storedLanguage !== undefined) {
languageDropdown.value = storedLanguage;
@@ -86,7 +77,6 @@ document.addEventListener("DOMContentLoaded", function () {
tabId: currentTab.id,
host: host,
port: port,
- useMultilingual: useMultilingualCheckbox.checked,
language: selectedLanguage,
task: selectedTask,
modelSize: selectedModelSize
@@ -129,9 +119,8 @@ document.addEventListener("DOMContentLoaded", function () {
startButton.disabled = isCapturing;
stopButton.disabled = !isCapturing;
useServerCheckbox.disabled = isCapturing;
- useMultilingualCheckbox.disabled = isCapturing;
modelSizeDropdown.disabled = isCapturing;
-
+ taskDropdown.disabled = isCapturing;
startButton.classList.toggle("disabled", isCapturing);
stopButton.classList.toggle("disabled", !isCapturing);
}
@@ -142,18 +131,6 @@ document.addEventListener("DOMContentLoaded", function () {
chrome.storage.local.set({ useServerState });
});
- useMultilingualCheckbox.addEventListener('change', function() {
- const useMultilingualModelState = useMultilingualCheckbox.checked;
- if (useMultilingualModelState) {
- languageDropdown.disabled = false;
- taskDropdown.disabled = false;
- } else {
- languageDropdown.disabled = true;
- taskDropdown.disabled = true;
- }
- chrome.storage.local.set({ useMultilingualModelState });
- });
-
languageDropdown.addEventListener('change', function() {
if (languageDropdown.value === "") {
selectedLanguage = null;
diff --git a/whisper_live/client.py b/whisper_live/client.py
index 3832556b..46d52300 100644
--- a/whisper_live/client.py
+++ b/whisper_live/client.py
@@ -76,7 +76,6 @@ def __init__(
self,
host=None,
port=None,
- is_multilingual=False,
lang=None,
translate=False,
model="small",
@@ -92,8 +91,7 @@ def __init__(
Args:
host (str): The hostname or IP address of the server.
port (int): The port number for the WebSocket server.
- is_multilingual (bool, optional): Specifies if multilingual transcription is enabled. Default is False.
- lang (str, optional): The selected language for transcription when multilingual is disabled. Default is None.
+ lang (str, optional): The selected language for transcription. Default is None.
translate (bool, optional): Specifies if the task is translation. Default is False.
"""
self.chunk = 4096
@@ -102,14 +100,11 @@ def __init__(
self.rate = 16000
self.record_seconds = 60000
self.recording = False
- self.multilingual = False
- self.language = None
self.task = "transcribe"
self.uid = str(uuid.uuid4())
self.waiting = False
self.last_response_recieved = None
self.disconnect_if_no_response_for = 15
- self.multilingual = is_multilingual
self.language = lang
self.model = model
self.server_error = False
@@ -247,7 +242,7 @@ def on_open(self, ws):
"""
Callback function called when the WebSocket connection is successfully opened.
- Sends an initial configuration message to the server, including client UID, multilingual mode,
+ Sends an initial configuration message to the server, including client UID,
language selection, and task type.
Args:
@@ -261,7 +256,6 @@ def on_open(self, ws):
json.dumps(
{
"uid": self.uid,
- "multilingual": self.multilingual,
"language": self.language,
"task": self.task,
"model": self.model,
@@ -548,8 +542,7 @@ class TranscriptionClient:
Args:
host (str): The hostname or IP address of the server.
port (int): The port number to connect to on the server.
- is_multilingual (bool, optional): Indicates whether the transcription should support multiple languages (default is False).
- lang (str, optional): The primary language for transcription (used if `is_multilingual` is False). Default is None, which defaults to English ('en').
+ lang (str, optional): The primary language for transcription. Default is None; the server overrides this with English ('en') for English-only models.
translate (bool, optional): Indicates whether translation tasks are required (default is False).
Attributes:
@@ -558,19 +551,18 @@ class TranscriptionClient:
Example:
To create a TranscriptionClient and start transcription on microphone audio:
```python
- transcription_client = TranscriptionClient(host="localhost", port=9090, is_multilingual=True)
+ transcription_client = TranscriptionClient(host="localhost", port=9090)
transcription_client()
```
"""
def __init__(self,
host,
port,
- is_multilingual=False,
lang=None,
translate=False,
model="small",
):
- self.client = Client(host, port, is_multilingual, lang, translate, model)
+ self.client = Client(host, port, lang, translate, model)
def __call__(self, audio=None, hls_url=None):
"""
diff --git a/whisper_live/server.py b/whisper_live/server.py
index b189576b..4a8a1d8d 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -136,6 +136,7 @@ def recv_audio(self,
)
logging.info(f"Running TensorRT backend.")
except Exception as e:
+ self.client_uid = options["uid"]
websocket.send(
json.dumps(
{
@@ -154,7 +155,6 @@ def recv_audio(self,
options["model"] = faster_whisper_custom_model_path
client = ServeClientFasterWhisper(
websocket,
- multilingual=options["multilingual"],
language=options["language"],
task=options["task"],
client_uid=options["uid"],
@@ -558,7 +558,6 @@ def __init__(
websocket,
task="transcribe",
device=None,
- multilingual=False,
language=None,
client_uid=None,
model="small.en",
@@ -575,7 +574,6 @@ def __init__(
websocket (WebSocket): The WebSocket connection for the client.
task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe".
device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None.
- multilingual (bool, optional): Whether the client supports multilingual transcription. Defaults to False.
language (str, optional): The language for transcription. Defaults to None.
client_uid (str, optional): A unique identifier for the client. Defaults to None.
@@ -585,13 +583,11 @@ def __init__(
"tiny", "tiny.en", "base", "base.en", "small", "small.en",
"medium", "medium.en", "large-v2", "large-v3",
]
-
- self.multilingual = multilingual
if not os.path.exists(model):
- self.model_size_or_path = self.get_model_size(model)
+ self.model_size_or_path = self.check_valid_model(model)
else:
self.model_size_or_path = model
- self.language = language if self.multilingual else "en"
+ self.language = "en" if (self.model_size_or_path or "").endswith("en") else language  # model_size_or_path is None when check_valid_model rejects the size
self.task = task
self.initial_prompt = initial_prompt
self.vad_parameters = vad_parameters or {"threshold": 0.5}
@@ -601,7 +597,7 @@ def __init__(
if self.model_size_or_path == None:
return
-
+
self.transcriber = WhisperModel(
self.model_size_or_path,
device=device,
@@ -622,9 +618,15 @@ def __init__(
)
)
- def get_model_size(self, model_size):
+ def check_valid_model(self, model_size):
"""
- Returns the whisper model size based on multilingual.
+ Check if it's a valid whisper model size.
+
+ Args:
+ model_size (str): The name of the model size to check.
+
+ Returns:
+ str: The model size if valid, None otherwise.
"""
if model_size not in self.model_sizes:
self.websocket.send(
@@ -637,15 +639,6 @@ def get_model_size(self, model_size):
)
)
return None
-
- if model_size.endswith("en") and self.multilingual:
- logging.info(f"Setting multilingual to false with {model_size} which is english only model.")
- self.multilingual = False
-
- if not model_size.endswith("en") and not self.multilingual:
- logging.info(f"Setting multilingual to true with multilingual model {model_size}.")
- self.multilingual = True
-
return model_size
def speech_to_text(self):