Merge pull request #215 from pipecat-ai/aleix/silero-vad-memory-fix

vad(silero): fix memory issue
pipecat-ai · Jun 5, 2024 · cd60a84 · cd60a84
2 parents 2d1ed9a + 3dd4bac
commit cd60a84
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to **pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.0.28] - 2024-06-05
+
+### Fixed
+
+- Fixed an issue with `SileroVADAnalyzer` that would cause memory to keep
+  growing indefinitely.
+
 ## [0.0.27] - 2024-06-05
 
 ### Added

diff --git a/src/pipecat/vad/silero.py b/src/pipecat/vad/silero.py
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+import time
+
 import numpy as np
 
 from pipecat.frames.frames import AudioRawFrame, Frame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
@@ -25,6 +27,9 @@
     logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
     raise Exception(f"Missing module(s): {e}")
 
+# Interval, in seconds, between resets of the model's internal state.
+_MODEL_RESET_STATES_TIME = 5.0
+
 
 class SileroVADAnalyzer(VADAnalyzer):
 
@@ -33,10 +38,12 @@ def __init__(self, sample_rate=16000, params: VADParams = VADParams()):
 
         logger.debug("Loading Silero VAD model...")
 
-        (self._model, self._utils) = torch.hub.load(
+        (self._model, utils) = torch.hub.load(
             repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
         )
 
+        self._last_reset_time = 0
+
         logger.debug("Loaded Silero VAD")
 
     #
@@ -52,6 +59,15 @@ def voice_confidence(self, buffer) -> float:
             # Divide by 32768 because we have signed 16-bit data.
             audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
             new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item()
+
+            # Reset the model's internal state periodically: it does not need
+            # the full history, and memory would otherwise keep growing.
+            curr_time = time.time()
+            diff_time = curr_time - self._last_reset_time
+            if diff_time >= _MODEL_RESET_STATES_TIME:
+                self._model.reset_states()
+                self._last_reset_time = curr_time
+
             return new_confidence
         except BaseException as e:
             # This comes from an empty audio array