From 02c532896e850da0ef2d61db68ea72d432903d1f Mon Sep 17 00:00:00 2001
From: Chad Bailey <chadbailey@gmail.com>
Date: Wed, 3 Apr 2024 20:21:11 +0000
Subject: [PATCH] webrtc VAD needs a different sample size

---
 src/dailyai/transports/daily_transport.py    | 10 +++++-----
 src/dailyai/transports/threaded_transport.py | 11 ++++++-----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/dailyai/transports/daily_transport.py b/src/dailyai/transports/daily_transport.py
index 68593db76..8e8760d5b 100644
--- a/src/dailyai/transports/daily_transport.py
+++ b/src/dailyai/transports/daily_transport.py
@@ -27,7 +27,6 @@
 
 from dailyai.transports.threaded_transport import ThreadedTransport
 
-SAMPLE_RATE = 16000
 NUM_CHANNELS = 1
 
 SPEECH_THRESHOLD = 0.90
@@ -91,7 +90,7 @@ def __init__(
 
         self.webrtc_vad = Daily.create_native_vad(
             reset_period_ms=VAD_RESET_PERIOD_MS,
-            sample_rate=SAMPLE_RATE,
+            sample_rate=self._speaker_sample_rate,
             channels=NUM_CHANNELS
         )
 
@@ -117,14 +116,15 @@ def _patch_method(self, event_name, *args, **kwargs):
 
     def _webrtc_vad_analyze(self):
         buffer = self.read_audio_frames(
-            int(self._speaker_sample_rate / 100))
+            int(self._vad_samples))
         if len(buffer) > 0:
             confidence = self.webrtc_vad.analyze_frames(buffer)
             # yeses = int(confidence * 20.0)
             # nos = 20 - yeses
             # out = "!" * yeses + "." * nos
-            # print(f"!!! confidence: {out}")
-            return confidence > 0.90
+            # print(f"!!! confidence: {out} {confidence}")
+            talking = confidence > 0.90
+            return talking
 
     def add_event_handler(self, event_name: str, handler):
         if not event_name.startswith("on_"):
diff --git a/src/dailyai/transports/threaded_transport.py b/src/dailyai/transports/threaded_transport.py
index a68992af6..9d44bf409 100644
--- a/src/dailyai/transports/threaded_transport.py
+++ b/src/dailyai/transports/threaded_transport.py
@@ -41,9 +41,6 @@ def int2float(sound):
     return sound
 
 
-SAMPLE_RATE = 16000
-
-
 class VADState(Enum):
     QUIET = 1
     STARTING = 2
@@ -81,17 +78,18 @@ def __init__(
                     repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
                 )
                 print(f"!!! Loaded Silero VAD")
+                self._vad_samples = 1536
 
             except ModuleNotFoundError as e:
                 if self._has_webrtc_vad:
                     print(f"Couldn't load torch; using webrtc VAD")
+                    self._vad_samples = int(self._speaker_sample_rate / 100.0)
                 else:
                     print(f"Exception: {e}")
                     print("In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.")
                     raise Exception(f"Missing module(s): {e}")
 
-        self._vad_samples = 1536
-        vad_frame_s = self._vad_samples / SAMPLE_RATE
+        vad_frame_s = self._vad_samples / self._speaker_sample_rate
         self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
         self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
         self._vad_starting_count = 0
@@ -314,6 +312,7 @@ def _vad(self):
                 self._vad_state == VADState.STARTING
                 and self._vad_starting_count >= self._vad_start_frames
             ):
+                print(f"!!! !!! STARTED SPEAKING")
                 if self._loop:
                     asyncio.run_coroutine_threadsafe(
                         self.receive_queue.put(
@@ -325,6 +324,8 @@ def _vad(self):
                 self._vad_state == VADState.STOPPING
                 and self._vad_stopping_count >= self._vad_stop_frames
             ):
+                print(f"!!! !!! STOPPED SPEAKING")
+
                 if self._loop:
                     asyncio.run_coroutine_threadsafe(
                         self.receive_queue.put(