Skip to content

Commit

Permalink
Merge pull request #215 from pipecat-ai/aleix/silero-vad-memory-fix
Browse files Browse the repository at this point in the history
vad(silero): fix memory issue
  • Loading branch information
aconchillo authored Jun 5, 2024
2 parents 2d1ed9a + 3dd4bac commit cd60a84
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 1 deletion.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to **pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.0.28] - 2024-06-05

### Fixed

- Fixed an issue with `SileroVADAnalyzer` that would cause memory to keep
growing indefinitely.

## [0.0.27] - 2024-06-05

### Added
Expand Down
18 changes: 17 additions & 1 deletion src/pipecat/vad/silero.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

import time

import numpy as np

from pipecat.frames.frames import AudioRawFrame, Frame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
Expand All @@ -25,6 +27,9 @@
logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
raise Exception(f"Missing module(s): {e}")

# Minimum interval, in seconds, between resets of the Silero model's internal
# state. `voice_confidence` compares this against a `time.time()` delta.
_MODEL_RESET_STATES_TIME = 5.0


class SileroVADAnalyzer(VADAnalyzer):

Expand All @@ -33,10 +38,12 @@ def __init__(self, sample_rate=16000, params: VADParams = VADParams()):

logger.debug("Loading Silero VAD model...")

(self._model, self._utils) = torch.hub.load(
(self._model, utils) = torch.hub.load(
repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
)

self._last_reset_time = 0

logger.debug("Loaded Silero VAD")

#
Expand All @@ -52,6 +59,15 @@ def voice_confidence(self, buffer) -> float:
# Divide by 32768 because we have signed 16-bit data.
audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item()

# Reset the model's internal state periodically: the model doesn't need
# all of the accumulated data, and memory keeps growing otherwise.
curr_time = time.time()
diff_time = curr_time - self._last_reset_time
if diff_time >= _MODEL_RESET_STATES_TIME:
self._model.reset_states()
self._last_reset_time = curr_time

return new_confidence
except BaseException as e:
# This comes from an empty audio array
Expand Down

0 comments on commit cd60a84

Please sign in to comment.