class SileroOnnxModel:
    """Stateful ONNX Runtime wrapper around a Silero VAD ``.onnx`` model.

    The wrapper keeps two pieces of state between calls:

    * ``_state``: the recurrent state array of shape ``(2, batch, 128)`` that
      is fed to, and read back from, the ONNX graph on every call.
    * ``_context``: the trailing samples of the previous chunk (64 samples at
      16 kHz, 32 samples at 8 kHz) that are prepended to the next chunk.

    Both are cleared by :meth:`reset_states`, and automatically whenever the
    sample rate or batch size changes between calls.

    NOTE: relies on the module-level ``np`` (numpy) and ``onnxruntime`` names.
    """

    def __init__(self, path, force_onnx_cpu=True):
        """Create the inference session for the model file at ``path``.

        Args:
            path: Location of the ONNX model, passed to
                ``onnxruntime.InferenceSession``.
            force_onnx_cpu: If true and ``CPUExecutionProvider`` is available,
                run on that provider only; otherwise onnxruntime picks its
                default providers.
        """
        opts = onnxruntime.SessionOptions()
        # Single-threaded execution for this session.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(
                path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.sample_rates = [8000, 16000]
        self.reset_states()

    def _validate_input(self, x, sr: int):
        """Normalise ``x`` to 2-D ``(batch, samples)`` and sanity-check it.

        Args:
            x: Audio chunk, either 1-D ``(samples,)`` or 2-D
                ``(batch, samples)``.
            sr: Sample rate; must be one of ``self.sample_rates``.

        Returns:
            The tuple ``(x, sr)`` with ``x`` guaranteed to be 2-D.

        Raises:
            ValueError: If ``x`` has more than two dimensions, ``sr`` is not
                supported, or the chunk is empty / shorter than ``sr / 31.25``
                samples.
        """
        if np.ndim(x) == 1:
            x = np.expand_dims(x, 0)
        if np.ndim(x) > 2:
            # np.ndim works for arrays and array-likes alike; ndarray has no
            # torch-style ``.dim()`` method.
            raise ValueError(f"Too many dimensions for input audio chunk {np.ndim(x)}")

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates}")

        # Same threshold as ``sr / samples > 31.25`` but written as a product
        # so that an empty chunk is reported as "too short" instead of
        # raising ZeroDivisionError.
        if sr > 31.25 * np.shape(x)[1]:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear the recurrent state, the audio context and the call history.

        Args:
            batch_size: Batch dimension to allocate the fresh state for.
        """
        self._state = np.zeros((2, batch_size, 128), dtype='float32')
        # Zero-width context marks "no previous chunk"; __call__ replaces it
        # with a zero-filled context of the proper width.
        self._context = np.zeros((batch_size, 0), dtype='float32')
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run the model on one audio chunk and update the internal state.

        Args:
            x: Audio chunk of exactly 512 samples (16 kHz) or 256 samples
                (8 kHz), 1-D or ``(batch, samples)``.
            sr: Sample rate, 8000 or 16000.

        Returns:
            The first output of the ONNX session for this chunk (used by the
            caller as the voice confidence).

        Raises:
            ValueError: If the input fails validation or has the wrong number
                of samples for ``sr``.
        """
        x, sr = self._validate_input(x, sr)

        if sr == 16000:
            num_samples, context_size = 512, 64
        else:
            num_samples, context_size = 256, 32

        if np.shape(x)[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = np.shape(x)[0]

        # Start from a clean state on the first call and whenever the stream
        # parameters change; stale state/context would not match the input.
        needs_reset = (
            not self._last_batch_size
            or self._last_sr != sr
            or self._last_batch_size != batch_size
        )
        if needs_reset:
            self.reset_states(batch_size)

        if not np.shape(self._context)[1]:
            self._context = np.zeros((batch_size, context_size), dtype='float32')

        x = np.concatenate((self._context, x), axis=1)

        # ``sr`` was already validated, so no further branching is required.
        ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
        out, state = self.session.run(None, ort_inputs)
        self._state = state

        # Keep the tail of this chunk as context for the next one.
        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        return out
'silero_vad.onnx' + package_path = "pipecat.vad.data" + + try: + import importlib_resources as impresources + model_file_path = str(impresources.files(package_path).joinpath(model_name)) + except BaseException: + from importlib import resources as impresources + try: + with impresources.path(package_path, model_name) as f: + model_file_path = f + except BaseException: + model_file_path = str(impresources.files(package_path).joinpath(model_name)) + + self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True) self._last_reset_time = 0 @@ -59,7 +150,7 @@ def voice_confidence(self, buffer) -> float: audio_int16 = np.frombuffer(buffer, np.int16) # Divide by 32768 because we have signed 16-bit data. audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0 - new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item() + new_confidence = self._model(audio_float32, self.sample_rate)[0] # We need to reset the model from time to time because it doesn't # really need all the data and memory will keep growing otherwise.