From 02c532896e850da0ef2d61db68ea72d432903d1f Mon Sep 17 00:00:00 2001 From: Chad Bailey Date: Wed, 3 Apr 2024 20:21:11 +0000 Subject: [PATCH] webrtc VAD needs a different sample size --- src/dailyai/transports/daily_transport.py | 10 +++++----- src/dailyai/transports/threaded_transport.py | 11 ++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/dailyai/transports/daily_transport.py b/src/dailyai/transports/daily_transport.py index 68593db76..8e8760d5b 100644 --- a/src/dailyai/transports/daily_transport.py +++ b/src/dailyai/transports/daily_transport.py @@ -27,7 +27,6 @@ from dailyai.transports.threaded_transport import ThreadedTransport -SAMPLE_RATE = 16000 NUM_CHANNELS = 1 SPEECH_THRESHOLD = 0.90 @@ -91,7 +90,7 @@ def __init__( self.webrtc_vad = Daily.create_native_vad( reset_period_ms=VAD_RESET_PERIOD_MS, - sample_rate=SAMPLE_RATE, + sample_rate=self._speaker_sample_rate, channels=NUM_CHANNELS ) @@ -117,14 +116,15 @@ def _patch_method(self, event_name, *args, **kwargs): def _webrtc_vad_analyze(self): buffer = self.read_audio_frames( - int(self._speaker_sample_rate / 100)) + int(self._vad_samples)) if len(buffer) > 0: confidence = self.webrtc_vad.analyze_frames(buffer) # yeses = int(confidence * 20.0) # nos = 20 - yeses # out = "!" * yeses + "." * nos - # print(f"!!! 
confidence: {out} {confidence}") + talking = confidence > 0.90 + return talking def add_event_handler(self, event_name: str, handler): if not event_name.startswith("on_"): diff --git a/src/dailyai/transports/threaded_transport.py b/src/dailyai/transports/threaded_transport.py index a68992af6..9d44bf409 100644 --- a/src/dailyai/transports/threaded_transport.py +++ b/src/dailyai/transports/threaded_transport.py @@ -41,9 +41,6 @@ def int2float(sound): return sound -SAMPLE_RATE = 16000 - - class VADState(Enum): QUIET = 1 STARTING = 2 @@ -81,17 +78,18 @@ def __init__( repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False ) print(f"!!! Loaded Silero VAD") + self._vad_samples = 1536 except ModuleNotFoundError as e: if self._has_webrtc_vad: print(f"Couldn't load torch; using webrtc VAD") + self._vad_samples = int(self._speaker_sample_rate / 100.0) else: print(f"Exception: {e}") print("In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.") raise Exception(f"Missing module(s): {e}") - self._vad_samples = 1536 - vad_frame_s = self._vad_samples / SAMPLE_RATE + vad_frame_s = self._vad_samples / self._speaker_sample_rate self._vad_start_frames = round(self._vad_start_s / vad_frame_s) self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s) self._vad_starting_count = 0 @@ -314,6 +312,7 @@ def _vad(self): self._vad_state == VADState.STARTING and self._vad_starting_count >= self._vad_start_frames ): + print(f"!!! !!! STARTED SPEAKING") if self._loop: asyncio.run_coroutine_threadsafe( self.receive_queue.put( @@ -325,6 +324,8 @@ def _vad(self): self._vad_state == VADState.STOPPING and self._vad_stopping_count >= self._vad_stop_frames ): + print(f"!!! !!! STOPPED SPEAKING") + if self._loop: asyncio.run_coroutine_threadsafe( self.receive_queue.put(