webrtc VAD needs a different sample size

pipecat-ai · Apr 3, 2024 · 02c5328 · 02c5328
1 parent 3d03547
commit 02c5328
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 10 deletions.
diff --git a/src/dailyai/transports/daily_transport.py b/src/dailyai/transports/daily_transport.py
@@ -27,7 +27,6 @@
 
 from dailyai.transports.threaded_transport import ThreadedTransport
 
-SAMPLE_RATE = 16000
 NUM_CHANNELS = 1
 
 SPEECH_THRESHOLD = 0.90
@@ -91,7 +90,7 @@ def __init__(
 
         self.webrtc_vad = Daily.create_native_vad(
             reset_period_ms=VAD_RESET_PERIOD_MS,
-            sample_rate=SAMPLE_RATE,
+            sample_rate=self._speaker_sample_rate,
             channels=NUM_CHANNELS
         )
 
@@ -117,14 +116,15 @@ def _patch_method(self, event_name, *args, **kwargs):
 
     def _webrtc_vad_analyze(self):
         buffer = self.read_audio_frames(
-            int(self._speaker_sample_rate / 100))
+            int(self._vad_samples))
         if len(buffer) > 0:
             confidence = self.webrtc_vad.analyze_frames(buffer)
             # yeses = int(confidence * 20.0)
             # nos = 20 - yeses
             # out = "!" * yeses + "." * nos
-            # print(f"!!! confidence: {out}")
-            return confidence > 0.90
+            # print(f"!!! confidence: {out} {confidence}")
+            talking = confidence > 0.90
+            return talking
 
     def add_event_handler(self, event_name: str, handler):
         if not event_name.startswith("on_"):

diff --git a/src/dailyai/transports/threaded_transport.py b/src/dailyai/transports/threaded_transport.py
@@ -41,9 +41,6 @@ def int2float(sound):
     return sound
 
 
-SAMPLE_RATE = 16000
-
-
 class VADState(Enum):
     QUIET = 1
     STARTING = 2
@@ -81,17 +78,18 @@ def __init__(
                     repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
                 )
                 print(f"!!! Loaded Silero VAD")
+                self._vad_samples = 1536
 
             except ModuleNotFoundError as e:
                 if self._has_webrtc_vad:
                     print(f"Couldn't load torch; using webrtc VAD")
+                    self._vad_samples = int(self._speaker_sample_rate / 100.0)
                 else:
                     print(f"Exception: {e}")
                     print("In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.")
                     raise Exception(f"Missing module(s): {e}")
 
-        self._vad_samples = 1536
-        vad_frame_s = self._vad_samples / SAMPLE_RATE
+        vad_frame_s = self._vad_samples / self._speaker_sample_rate
         self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
         self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
         self._vad_starting_count = 0
@@ -314,6 +312,7 @@ def _vad(self):
                 self._vad_state == VADState.STARTING
                 and self._vad_starting_count >= self._vad_start_frames
             ):
+                print(f"!!! !!! STARTED SPEAKING")
                 if self._loop:
                     asyncio.run_coroutine_threadsafe(
                         self.receive_queue.put(
@@ -325,6 +324,8 @@ def _vad(self):
                 self._vad_state == VADState.STOPPING
                 and self._vad_stopping_count >= self._vad_stop_frames
             ):
+                print(f"!!! !!! STOPPED SPEAKING")
+
                 if self._loop:
                     asyncio.run_coroutine_threadsafe(
                         self.receive_queue.put(