Merge pull request #158 from pipecat-ai/use-pyloudnorm-loudness

interruptions: introduce pyloudnorm to compute loudness
pipecat-ai · May 22, 2024 · 6ac012a · 6ac012a
2 parents 34670ee + 075194c
commit 6ac012a
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,16 @@ All notable changes to **pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [0.0.20] - 2024-05-22
+
+### Added
+
+- In order to improve interruptions we now compute a loudness level using
+  [pyloudnorm](https://github.com/csteinmetz1/pyloudnorm). The audio coming
+  from WebRTC transports (e.g. Daily) has an Automatic Gain Control (AGC)
+  algorithm applied to the signal; however, we don't do that on our local
+  PyAudio signals. This means that incoming audio from PyAudio is currently
+  kind of broken. We will fix it in future releases.
 
 ### Fixed
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "numpy~=1.26.4",
     "loguru~=0.7.0",
     "Pillow~=10.3.0",
+    "pyloudnorm~=0.1.1",
     "typing-extensions~=4.11.0",
 ]
 

diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
@@ -24,6 +24,7 @@
     VisionImageRawFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.utils.audio import calculate_audio_volume
 from pipecat.utils.utils import exp_smoothing
 
 
@@ -96,13 +97,13 @@ class STTService(AIService):
     """STTService is a base class for speech-to-text services."""
 
     def __init__(self,
-                 min_rms: int = 100,
+                 min_volume: float = 0.6,
                  max_silence_secs: float = 0.3,
                  max_buffer_secs: float = 1.5,
                  sample_rate: int = 16000,
                  num_channels: int = 1):
         super().__init__()
-        self._min_rms = min_rms
+        self._min_volume = min_volume
         self._max_silence_secs = max_silence_secs
         self._max_buffer_secs = max_buffer_secs
         self._sample_rate = sample_rate
@@ -111,7 +112,7 @@ def __init__(self,
         self._silence_num_frames = 0
         # Volume exponential smoothing
         self._smoothing_factor = 0.5
-        self._prev_rms = 1 - self._smoothing_factor
+        self._prev_volume = 1 - self._smoothing_factor
 
     @abstractmethod
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
@@ -126,25 +127,24 @@ def _new_wave(self):
         ww.setframerate(self._sample_rate)
         return (content, ww)
 
-    def _get_smoothed_volume(self, audio: bytes, prev_rms: float, factor: float) -> float:
-        # https://docs.python.org/3/library/array.html
-        audio_array = array.array('h', audio)
-        squares = [sample**2 for sample in audio_array]
-        mean = sum(squares) / len(audio_array)
-        rms = math.sqrt(mean)
-        return exp_smoothing(rms, prev_rms, factor)
+    def _get_smoothed_volume(
+            self,
+            frame: AudioRawFrame,
+            prev_volume: float,
+            factor: float) -> float:
+        volume = calculate_audio_volume(frame.audio, frame.sample_rate)
+        return exp_smoothing(volume, prev_volume, factor)
 
     async def _append_audio(self, frame: AudioRawFrame):
         # Try to filter out empty background noise
-        # (Very rudimentary approach, can be improved)
-        rms = self._get_smoothed_volume(frame.audio, self._prev_rms, self._smoothing_factor)
-        if rms >= self._min_rms:
+        volume = self._get_smoothed_volume(frame, self._prev_volume, self._smoothing_factor)
+        if volume >= self._min_volume:
             # If volume is high enough, write new data to wave file
             self._wave.writeframes(frame.audio)
             self._silence_num_frames = 0
         else:
             self._silence_num_frames += frame.num_frames
-        self._prev_rms = rms
+        self._prev_volume = volume
 
         # If buffer is not empty and we have enough data or there's been a long
         # silence, transcribe the audio gathered so far.

diff --git a/src/pipecat/utils/audio.py b/src/pipecat/utils/audio.py
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import numpy as np
+import pyloudnorm as pyln
+
+
+def normalize_value(value, min_value, max_value):
+    normalized = (value - min_value) / (max_value - min_value)
+    normalized_clamped = max(0, min(1, normalized))
+    return normalized_clamped
+
+
+def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
+    audio_np = np.frombuffer(audio, dtype=np.int16)
+    audio_float = audio_np.astype(np.float64)
+
+    block_size = audio_np.size / sample_rate
+    meter = pyln.Meter(sample_rate, block_size=block_size)
+    loudness = meter.integrated_loudness(audio_float)
+
+    # Loudness goes from -20 to 80 (more or less), where -20 is quiet and 80 is
+    # loud.
+    loudness = normalize_value(loudness, -20, 80)
+
+    return loudness
+
+
+def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
+    return prev_value + factor * (value - prev_value)
diff --git a/src/pipecat/vad/vad_analyzer.py b/src/pipecat/vad/vad_analyzer.py
@@ -4,15 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-import array
-import math
-
 from abc import abstractmethod
 from enum import Enum
 
 from pydantic.main import BaseModel
 
-from pipecat.utils.utils import exp_smoothing
+from pipecat.utils.audio import calculate_audio_volume
 
 
 class VADState(Enum):
@@ -26,13 +23,14 @@ class VADParams(BaseModel):
     confidence: float = 0.6
     start_secs: float = 0.2
     stop_secs: float = 0.8
-    min_rms: int = 1000
+    min_volume: float = 0.6
 
 
 class VADAnalyzer:
 
     def __init__(self, sample_rate: int, num_channels: int, params: VADParams):
         self._sample_rate = sample_rate
+        self._num_channels = num_channels
         self._params = params
         self._vad_frames = self.num_frames_required()
         self._vad_frames_num_bytes = self._vad_frames * num_channels * 2
@@ -47,10 +45,6 @@ def __init__(self, sample_rate: int, num_channels: int, params: VADParams):
 
         self._vad_buffer = b""
 
-        # Volume exponential smoothing
-        self._smoothing_factor = 0.5
-        self._prev_rms = 1 - self._smoothing_factor
-
     @property
     def sample_rate(self):
         return self._sample_rate
@@ -63,14 +57,6 @@ def num_frames_required(self) -> int:
     def voice_confidence(self, buffer) -> float:
         pass
 
-    def _get_smoothed_volume(self, audio: bytes, prev_rms: float, factor: float) -> float:
-        # https://docs.python.org/3/library/array.html
-        audio_array = array.array('h', audio)
-        squares = [sample**2 for sample in audio_array]
-        mean = sum(squares) / len(audio_array)
-        rms = math.sqrt(mean)
-        return exp_smoothing(rms, prev_rms, factor)
-
     def analyze_audio(self, buffer) -> VADState:
         self._vad_buffer += buffer
 
@@ -82,10 +68,10 @@ def analyze_audio(self, buffer) -> VADState:
         self._vad_buffer = self._vad_buffer[num_required_bytes:]
 
         confidence = self.voice_confidence(audio_frames)
-        rms = self._get_smoothed_volume(audio_frames, self._prev_rms, self._smoothing_factor)
-        self._prev_rms = rms
 
-        speaking = confidence >= self._params.confidence and rms >= self._params.min_rms
+        volume = calculate_audio_volume(audio_frames, self._sample_rate)
+
+        speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
 
         if speaking:
             match self._vad_state: