diff --git a/CHANGELOG.md b/CHANGELOG.md index d46b012c3..5396dcf42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to **pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.28] - 2024-06-05 + +### Fixed + +- Fixed an issue with `SileroVADAnalyzer` that would cause memory to keep + growing indefinitely. + ## [0.0.27] - 2024-06-05 ### Added diff --git a/src/pipecat/vad/silero.py b/src/pipecat/vad/silero.py index 78ea4f9b7..97d0a2144 100644 --- a/src/pipecat/vad/silero.py +++ b/src/pipecat/vad/silero.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import time + import numpy as np from pipecat.frames.frames import AudioRawFrame, Frame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame @@ -25,6 +27,9 @@ logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.") raise Exception(f"Missing module(s): {e}") +# How often should we reset internal model state +_MODEL_RESET_STATES_TIME = 5.0 + class SileroVADAnalyzer(VADAnalyzer): @@ -33,10 +38,12 @@ def __init__(self, sample_rate=16000, params: VADParams = VADParams()): logger.debug("Loading Silero VAD model...") - (self._model, self._utils) = torch.hub.load( + (self._model, utils) = torch.hub.load( repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False ) + self._last_reset_time = 0 + logger.debug("Loaded Silero VAD") # @@ -52,6 +59,15 @@ def voice_confidence(self, buffer) -> float: # Divide by 32768 because we have signed 16-bit data. audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0 new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item() + + # We need to reset the model from time to time because it doesn't + # really need all the data and memory will keep growing otherwise. + curr_time = time.time() + diff_time = curr_time - self._last_reset_time + if diff_time >= _MODEL_RESET_STATES_TIME: + self._model.reset_states() + self._last_reset_time = curr_time + return new_confidence except BaseException as e: # This comes from an empty audio array