Skip to content

Commit

Permalink
Merge pull request #147 from makaveli10/vad_option
Browse files Browse the repository at this point in the history
add VAD as a client option
  • Loading branch information
makaveli10 authored Feb 21, 2024
2 parents c919ba3 + babe5de commit 1db94ea
Show file tree
Hide file tree
Showing 13 changed files with 122 additions and 57 deletions.
3 changes: 2 additions & 1 deletion Audio-Transcription-Chrome/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ async function startCapture(options) {
multilingual: options.useMultilingual,
language: options.language,
task: options.task,
modelSize: options.modelSize
modelSize: options.modelSize,
useVad: options.useVad,
},
});
} else {
Expand Down
3 changes: 2 additions & 1 deletion Audio-Transcription-Chrome/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ async function startRecord(option) {
uid: uuid,
language: option.language,
task: option.task,
model: option.modelSize
model: option.modelSize,
use_vad: option.useVad
})
);
};
Expand Down
4 changes: 4 additions & 0 deletions Audio-Transcription-Chrome/popup.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
<input type="checkbox" id="useServerCheckbox">
<label for="useServerCheckbox">Use Collabora Whisper-Live Server</label>
</div>
<div class="checkbox-container">
<input type="checkbox" id="useVadCheckbox">
<label for="useVadCheckbox">Use Voice Activity Detection</label>
</div>
<div class="dropdown-container">
<label for="languageDropdown">Select Language:</label>
<select id="languageDropdown">
Expand Down
18 changes: 16 additions & 2 deletions Audio-Transcription-Chrome/popup.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ document.addEventListener("DOMContentLoaded", function () {
const stopButton = document.getElementById("stopCapture");

const useServerCheckbox = document.getElementById("useServerCheckbox");
const useVadCheckbox = document.getElementById("useVadCheckbox");
const languageDropdown = document.getElementById('languageDropdown');
const taskDropdown = document.getElementById('taskDropdown');
const modelSizeDropdown = document.getElementById('modelSizeDropdown');
Expand Down Expand Up @@ -31,6 +32,12 @@ document.addEventListener("DOMContentLoaded", function () {
}
});

chrome.storage.local.get("useVadState", ({ useVadState }) => {
if (useVadState !== undefined) {
useVadCheckbox.checked = useVadState;
}
});

chrome.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => {
if (storedLanguage !== undefined) {
languageDropdown.value = storedLanguage;
Expand Down Expand Up @@ -79,7 +86,8 @@ document.addEventListener("DOMContentLoaded", function () {
port: port,
language: selectedLanguage,
task: selectedTask,
modelSize: selectedModelSize
modelSize: selectedModelSize,
useVad: useVadCheckbox.checked,
}, () => {
// Update capturing state in storage and toggle the buttons
chrome.storage.local.set({ capturingState: { isCapturing: true } }, () => {
Expand Down Expand Up @@ -118,7 +126,8 @@ document.addEventListener("DOMContentLoaded", function () {
function toggleCaptureButtons(isCapturing) {
startButton.disabled = isCapturing;
stopButton.disabled = !isCapturing;
useServerCheckbox.disabled = isCapturing;
useServerCheckbox.disabled = isCapturing;
useVadCheckbox.disabled = isCapturing;
modelSizeDropdown.disabled = isCapturing;
languageDropdown.disabled = isCapturing;
taskDropdown.disabled = isCapturing;
Expand All @@ -132,6 +141,11 @@ document.addEventListener("DOMContentLoaded", function () {
chrome.storage.local.set({ useServerState });
});

useVadCheckbox.addEventListener("change", () => {
const useVadState = useVadCheckbox.checked;
chrome.storage.local.set({ useVadState });
});

languageDropdown.addEventListener('change', function() {
if (languageDropdown.value === "") {
selectedLanguage = null;
Expand Down
3 changes: 2 additions & 1 deletion Audio-Transcription-Firefox/content.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ function startRecording(data) {
uid: uuid,
language: data.language,
task: data.task,
model: data.modelSize
model: data.modelSize,
use_vad: data.useVad
})
);
};
Expand Down
4 changes: 4 additions & 0 deletions Audio-Transcription-Firefox/popup.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
<input type="checkbox" id="useServerCheckbox">
<label for="useServerCheckbox">Use Collabora Whisper-Live Server</label>
</div>
<div class="checkbox-container">
<input type="checkbox" id="useVadCheckbox">
<label for="useVadCheckbox">Use Voice Activity Detection</label>
</div>
<textarea id="waitTextBox" style="display: none;"></textarea>
<div class="dropdown-container">
<label for="languageDropdown">Select Language:</label>
Expand Down
16 changes: 15 additions & 1 deletion Audio-Transcription-Firefox/popup.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ document.addEventListener("DOMContentLoaded", function() {
const stopButton = document.getElementById("stopCapture");

const useServerCheckbox = document.getElementById("useServerCheckbox");
const useVadCheckbox = document.getElementById("useVadCheckbox");
const languageDropdown = document.getElementById('languageDropdown');
const taskDropdown = document.getElementById('taskDropdown');
const modelSizeDropdown = document.getElementById('modelSizeDropdown');
Expand Down Expand Up @@ -34,6 +35,12 @@ document.addEventListener("DOMContentLoaded", function() {
}
});

browser.storage.local.get("useVadState", ({ useVadState }) => {
if (useVadState !== undefined) {
useVadCheckbox.checked = useVadState;
}
});

browser.storage.local.get("selectedLanguage", ({ selectedLanguage: storedLanguage }) => {
if (storedLanguage !== undefined) {
languageDropdown.value = storedLanguage;
Expand Down Expand Up @@ -76,7 +83,8 @@ document.addEventListener("DOMContentLoaded", function() {
port: port,
language: selectedLanguage,
task: selectedTask,
modelSize: selectedModelSize
modelSize: selectedModelSize,
useVad: useVadCheckbox.checked,
}
});
toggleCaptureButtons(true);
Expand Down Expand Up @@ -115,6 +123,7 @@ document.addEventListener("DOMContentLoaded", function() {
startButton.disabled = isCapturing;
stopButton.disabled = !isCapturing;
useServerCheckbox.disabled = isCapturing;
useVadCheckbox.disabled = isCapturing;
modelSizeDropdown.disabled = isCapturing;
languageDropdown.disabled = isCapturing;
taskDropdown.disabled = isCapturing;
Expand All @@ -128,6 +137,11 @@ document.addEventListener("DOMContentLoaded", function() {
browser.storage.local.set({ useServerState });
});

useVadCheckbox.addEventListener("change", () => {
const useVadState = useVadCheckbox.checked;
browser.storage.local.set({ useVadState });
});

languageDropdown.addEventListener('change', function() {
if (languageDropdown.value === "") {
selectedLanguage = null;
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ client = TranscriptionClient(
9090,
lang="en",
translate=False,
model="small"
model="small",
use_vad=False,
)
```
It connects to the server running on localhost at port 9090. Using a multilingual model, language for the transcription will be automatically detected. You can also use the language option to specify the target language for the transcription, in this case, English ("en"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language.
Expand Down
4 changes: 2 additions & 2 deletions TensorRT_whisper.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ docker pull ghcr.io/collabora/whisperbot-base:latest
```bash
docker run -it --gpus all --shm-size=8g \
--ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
-v /path/to/WhisperLive:/home/WhisperLive \
-p 9090:9090 -v /path/to/WhisperLive:/home/WhisperLive \
ghcr.io/collabora/whisperbot-base:latest
```

Expand All @@ -48,7 +48,7 @@ bash scripts/build_whisper_tensorrt.sh /root/TensorRT-LLM-examples small
cd /home/WhisperLive

# Install requirements
bash scripts/setup.sh
apt update && bash scripts/setup.sh
pip install -r requirements/server.txt

# Required to create mel spectrogram
Expand Down
1 change: 1 addition & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def test_on_open(self):
"language": self.client.language,
"task": self.client.task,
"model": self.client.model,
"use_vad": True
})
self.client.on_open(self.mock_ws_app)
self.mock_ws_app.send.assert_called_with(expected_message)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def test_recv_audio_exception_handling(self, mock_websocket):
class TestServerInferenceAccuracy(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.server_process = subprocess.Popen(["python", "run_server.py"]) # Adjust the command as needed
cls.server_process = subprocess.Popen(["python", "run_server.py"])
time.sleep(2)

@classmethod
Expand Down Expand Up @@ -134,4 +134,4 @@ def test_unexpected_exception_handling(self, mock_websocket):
for message in log.output:
print(message)
print()
self.assertTrue(any("Unexpected error: Unexpected error" in message for message in log.output))
self.assertTrue(any("Unexpected error" in message for message in log.output))
19 changes: 14 additions & 5 deletions whisper_live/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class Client:
Handles audio recording, streaming, and communication with a server using WebSocket.
"""
INSTANCES = {}
END_OF_AUDIO = "END_OF_AUDIO"

def __init__(
self,
Expand All @@ -25,7 +26,8 @@ def __init__(
lang=None,
translate=False,
model="small",
srt_file_path="output.srt"
srt_file_path="output.srt",
use_vad=True
):
"""
Initializes a Client instance for audio recording and streaming to a server.
Expand Down Expand Up @@ -55,6 +57,8 @@ def __init__(
self.model = model
self.server_error = False
self.srt_file_path = srt_file_path
self.use_vad = use_vad
self.last_recieved_segment = None

if translate:
self.task = "translate"
Expand Down Expand Up @@ -120,6 +124,10 @@ def process_segments(self, segments):
(not self.transcript or
float(seg['start']) >= float(self.transcript[-1]['end']))):
self.transcript.append(seg)
# update last received segment and last valid response time
if self.last_recieved_segment is None or self.last_recieved_segment != segments[-1]["text"]:
self.last_response_recieved = time.time()
self.last_recieved_segment = segments[-1]["text"]

# Truncate to last 3 entries for brevity.
text = text[-3:]
Expand All @@ -139,7 +147,6 @@ def on_message(self, ws, message):
message (str): The received message from the server.
"""
self.last_response_recieved = time.time()
message = json.loads(message)

if self.uid != message.get("uid"):
Expand All @@ -155,6 +162,7 @@ def on_message(self, ws, message):
self.recording = False

if "message" in message.keys() and message["message"] == "SERVER_READY":
self.last_response_recieved = time.time()
self.recording = True
self.server_backend = message["backend"]
print(f"[INFO]: Server Running with backend {self.server_backend}")
Expand Down Expand Up @@ -201,6 +209,7 @@ def on_open(self, ws):
"language": self.language,
"task": self.task,
"model": self.model,
"use_vad": self.use_vad
}
)
)
Expand Down Expand Up @@ -275,7 +284,7 @@ def play_file(self, filename):
assert self.last_response_recieved
while time.time() - self.last_response_recieved < self.disconnect_if_no_response_for:
continue

self.send_packet_to_server(Client.END_OF_AUDIO.encode('utf-8'))
if self.server_backend == "faster_whisper":
self.write_srt_file(self.srt_file_path)
self.stream.close()
Expand Down Expand Up @@ -497,8 +506,8 @@ class TranscriptionClient:
transcription_client()
```
"""
def __init__(self, host, port, lang=None, translate=False, model="small"):
self.client = Client(host, port, lang, translate, model)
def __init__(self, host, port, lang=None, translate=False, model="small", use_vad=True):
self.client = Client(host, port, lang, translate, model, srt_file_path="output.srt", use_vad=use_vad)

def __call__(self, audio=None, hls_url=None):
"""
Expand Down
Loading

0 comments on commit 1db94ea

Please sign in to comment.