diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bd0cec40..8798bc9aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,16 @@ All notable changes to **pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.0.20] - 2024-05-22 + +### Added + +- In order to improve interruptions we now compute a loudness level using + [pyloudnorm](https://github.com/csteinmetz1/pyloudnorm). The audio coming + from WebRTC transports (e.g. Daily) has an Automatic Gain Control (AGC) algorithm + applied to the signal, however we don't do that on our local PyAudio + signals. This means that currently incoming audio from PyAudio is kind of + broken. We will fix it in future releases. ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 983a7b4c4..00fbc826b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "numpy~=1.26.4", "loguru~=0.7.0", "Pillow~=10.3.0", + "pyloudnorm~=0.1.1", "typing-extensions~=4.11.0", ] diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 52d62b7c8..f42086dad 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -24,6 +24,7 @@ VisionImageRawFrame, ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.utils.audio import calculate_audio_volume from pipecat.utils.utils import exp_smoothing @@ -96,13 +97,13 @@ class STTService(AIService): """STTService is a base class for speech-to-text services.""" def __init__(self, - min_rms: int = 100, + min_volume: float = 0.6, max_silence_secs: float = 0.3, max_buffer_secs: float = 1.5, sample_rate: int = 16000, num_channels: int = 1): super().__init__() - self._min_rms = min_rms + self._min_volume = min_volume self._max_silence_secs = max_silence_secs self._max_buffer_secs = max_buffer_secs 
self._sample_rate = sample_rate @@ -111,7 +112,7 @@ def __init__(self, self._silence_num_frames = 0 # Volume exponential smoothing self._smoothing_factor = 0.5 - self._prev_rms = 1 - self._smoothing_factor + self._prev_volume = 1 - self._smoothing_factor @abstractmethod async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: @@ -126,25 +127,24 @@ def _new_wave(self): ww.setframerate(self._sample_rate) return (content, ww) - def _get_smoothed_volume(self, audio: bytes, prev_rms: float, factor: float) -> float: - # https://docs.python.org/3/library/array.html - audio_array = array.array('h', audio) - squares = [sample**2 for sample in audio_array] - mean = sum(squares) / len(audio_array) - rms = math.sqrt(mean) - return exp_smoothing(rms, prev_rms, factor) + def _get_smoothed_volume( + self, + frame: AudioRawFrame, + prev_volume: float, + factor: float) -> float: + volume = calculate_audio_volume(frame.audio, frame.sample_rate) + return exp_smoothing(volume, prev_volume, factor) async def _append_audio(self, frame: AudioRawFrame): # Try to filter out empty background noise - # (Very rudimentary approach, can be improved) - rms = self._get_smoothed_volume(frame.audio, self._prev_rms, self._smoothing_factor) - if rms >= self._min_rms: + volume = self._get_smoothed_volume(frame, self._prev_volume, self._smoothing_factor) + if volume >= self._min_volume: # If volume is high enough, write new data to wave file self._wave.writeframes(frame.audio) self._silence_num_frames = 0 else: self._silence_num_frames += frame.num_frames - self._prev_rms = rms + self._prev_volume = volume # If buffer is not empty and we have enough data or there's been a long # silence, transcribe the audio gathered so far. 
diff --git a/src/pipecat/utils/audio.py b/src/pipecat/utils/audio.py new file mode 100644 index 000000000..3c1118fc5 --- /dev/null +++ b/src/pipecat/utils/audio.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import numpy as np +import pyloudnorm as pyln + + +def normalize_value(value, min_value, max_value): + normalized = (value - min_value) / (max_value - min_value) + normalized_clamped = max(0, min(1, normalized)) + return normalized_clamped + + +def calculate_audio_volume(audio: bytes, sample_rate: int) -> float: + audio_np = np.frombuffer(audio, dtype=np.int16) + audio_float = audio_np.astype(np.float64) + + block_size = audio_np.size / sample_rate + meter = pyln.Meter(sample_rate, block_size=block_size) + loudness = meter.integrated_loudness(audio_float) + + # Loudness goes from -20 to 80 (more or less), where -20 is quiet and 80 is + # loud. + loudness = normalize_value(loudness, -20, 80) + + return loudness + + +def exp_smoothing(value: float, prev_value: float, factor: float) -> float: + return prev_value + factor * (value - prev_value) diff --git a/src/pipecat/vad/vad_analyzer.py b/src/pipecat/vad/vad_analyzer.py index 15f036387..d05b85f9b 100644 --- a/src/pipecat/vad/vad_analyzer.py +++ b/src/pipecat/vad/vad_analyzer.py @@ -4,15 +4,12 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import array -import math - from abc import abstractmethod from enum import Enum from pydantic.main import BaseModel -from pipecat.utils.utils import exp_smoothing +from pipecat.utils.audio import calculate_audio_volume class VADState(Enum): @@ -26,13 +23,14 @@ class VADParams(BaseModel): confidence: float = 0.6 start_secs: float = 0.2 stop_secs: float = 0.8 - min_rms: int = 1000 + min_volume: float = 0.6 class VADAnalyzer: def __init__(self, sample_rate: int, num_channels: int, params: VADParams): self._sample_rate = sample_rate + self._num_channels = num_channels self._params = params self._vad_frames = 
self.num_frames_required() self._vad_frames_num_bytes = self._vad_frames * num_channels * 2 @@ -47,10 +45,6 @@ def __init__(self, sample_rate: int, num_channels: int, params: VADParams): self._vad_buffer = b"" - # Volume exponential smoothing - self._smoothing_factor = 0.5 - self._prev_rms = 1 - self._smoothing_factor - @property def sample_rate(self): return self._sample_rate @@ -63,14 +57,6 @@ def num_frames_required(self) -> int: def voice_confidence(self, buffer) -> float: pass - def _get_smoothed_volume(self, audio: bytes, prev_rms: float, factor: float) -> float: - # https://docs.python.org/3/library/array.html - audio_array = array.array('h', audio) - squares = [sample**2 for sample in audio_array] - mean = sum(squares) / len(audio_array) - rms = math.sqrt(mean) - return exp_smoothing(rms, prev_rms, factor) - def analyze_audio(self, buffer) -> VADState: self._vad_buffer += buffer @@ -82,10 +68,10 @@ def analyze_audio(self, buffer) -> VADState: self._vad_buffer = self._vad_buffer[num_required_bytes:] confidence = self.voice_confidence(audio_frames) - rms = self._get_smoothed_volume(audio_frames, self._prev_rms, self._smoothing_factor) - self._prev_rms = rms - speaking = confidence >= self._params.confidence and rms >= self._params.min_rms + volume = calculate_audio_volume(audio_frames, self._sample_rate) + + speaking = confidence >= self._params.confidence and volume >= self._params.min_volume if speaking: match self._vad_state: