Merge pull request #158 from pipecat-ai/use-pyloudnorm-loudness
interruptions: introduce pyloudnorm to compute loudness
aconchillo authored May 22, 2024
2 parents 34670ee + 075194c commit 6ac012a
Showing 5 changed files with 64 additions and 35 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
@@ -5,7 +5,16 @@ All notable changes to **pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [0.0.20] - 2024-05-22

### Added

- To improve interruptions we now compute a loudness level using
  [pyloudnorm](https://github.com/csteinmetz1/pyloudnorm). The audio coming
  from WebRTC transports (e.g. Daily) has an Automatic Gain Control (AGC)
  algorithm applied to the signal; however, we don't apply one to our local
  PyAudio signals. This means that incoming audio from PyAudio is currently
  somewhat broken. We will fix it in future releases. (See the usage sketch
  after this diff.)

### Fixed

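The gate described in the entry above boils down to computing a normalized loudness for each audio chunk and comparing it to a threshold. Below is a minimal sketch of that flow using the calculate_audio_volume() helper added in src/pipecat/utils/audio.py further down this diff; the 440 Hz test tone and the MIN_VOLUME constant are illustrative only (0.6 mirrors the new min_volume defaults in this PR).

import numpy as np

from pipecat.utils.audio import calculate_audio_volume

SAMPLE_RATE = 16000

# 200 ms of a 440 Hz tone encoded as 16-bit mono PCM, standing in for an
# incoming audio frame payload (illustrative data only).
t = np.arange(int(0.2 * SAMPLE_RATE)) / SAMPLE_RATE
pcm = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()

# calculate_audio_volume() returns a loudness value normalized to [0, 1].
volume = calculate_audio_volume(pcm, SAMPLE_RATE)

MIN_VOLUME = 0.6  # mirrors the new min_volume default in this PR
if volume >= MIN_VOLUME:
    print(f"keep audio (volume={volume:.2f})")
else:
    print(f"treat as background noise (volume={volume:.2f})")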
1 change: 1 addition & 0 deletions pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
"numpy~=1.26.4",
"loguru~=0.7.0",
"Pillow~=10.3.0",
"pyloudnorm~=0.1.1",
"typing-extensions~=4.11.0",
]

28 changes: 14 additions & 14 deletions src/pipecat/services/ai_services.py
@@ -24,6 +24,7 @@
VisionImageRawFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.audio import calculate_audio_volume
from pipecat.utils.utils import exp_smoothing


@@ -96,13 +97,13 @@ class STTService(AIService):
"""STTService is a base class for speech-to-text services."""

def __init__(self,
min_rms: int = 100,
min_volume: float = 0.6,
max_silence_secs: float = 0.3,
max_buffer_secs: float = 1.5,
sample_rate: int = 16000,
num_channels: int = 1):
super().__init__()
self._min_rms = min_rms
self._min_volume = min_volume
self._max_silence_secs = max_silence_secs
self._max_buffer_secs = max_buffer_secs
self._sample_rate = sample_rate
@@ -111,7 +112,7 @@ def __init__(self,
self._silence_num_frames = 0
# Volume exponential smoothing
self._smoothing_factor = 0.5
self._prev_rms = 1 - self._smoothing_factor
self._prev_volume = 1 - self._smoothing_factor

@abstractmethod
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
@@ -126,25 +127,24 @@ def _new_wave(self):
ww.setframerate(self._sample_rate)
return (content, ww)

def _get_smoothed_volume(self, audio: bytes, prev_rms: float, factor: float) -> float:
# https://docs.python.org/3/library/array.html
audio_array = array.array('h', audio)
squares = [sample**2 for sample in audio_array]
mean = sum(squares) / len(audio_array)
rms = math.sqrt(mean)
return exp_smoothing(rms, prev_rms, factor)
def _get_smoothed_volume(
self,
frame: AudioRawFrame,
prev_volume: float,
factor: float) -> float:
volume = calculate_audio_volume(frame.audio, frame.sample_rate)
return exp_smoothing(volume, prev_volume, factor)

async def _append_audio(self, frame: AudioRawFrame):
# Try to filter out empty background noise
# (Very rudimentary approach, can be improved)
rms = self._get_smoothed_volume(frame.audio, self._prev_rms, self._smoothing_factor)
if rms >= self._min_rms:
volume = self._get_smoothed_volume(frame, self._prev_volume, self._smoothing_factor)
if volume >= self._min_volume:
# If volume is high enough, write new data to wave file
self._wave.writeframes(frame.audio)
self._silence_num_frames = 0
else:
self._silence_num_frames += frame.num_frames
self._prev_rms = rms
self._prev_volume = volume

# If buffer is not empty and we have enough data or there's been a long
# silence, transcribe the audio gathered so far.
33 changes: 33 additions & 0 deletions src/pipecat/utils/audio.py
@@ -0,0 +1,33 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import numpy as np
import pyloudnorm as pyln


def normalize_value(value, min_value, max_value):
normalized = (value - min_value) / (max_value - min_value)
normalized_clamped = max(0, min(1, normalized))
return normalized_clamped


def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
audio_np = np.frombuffer(audio, dtype=np.int16)
audio_float = audio_np.astype(np.float64)

block_size = audio_np.size / sample_rate
meter = pyln.Meter(sample_rate, block_size=block_size)
loudness = meter.integrated_loudness(audio_float)

# Loudness goes from -20 to 80 (more or less), where -20 is quiet and 80 is
# loud.
loudness = normalize_value(loudness, -20, 80)

return loudness


def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
return prev_value + factor * (value - prev_value)
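To make the ranges concrete, here are a few worked values for the two pure helpers above (a sketch, not part of the diff): a raw loudness of 30 lands halfway through the [-20, 80] range, out-of-range readings clamp to [0, 1], and smoothing with factor 0.5 moves halfway from the previous value toward the new one.

from pipecat.utils.audio import exp_smoothing, normalize_value

# A loudness reading of 30 sits halfway through the [-20, 80] range.
assert normalize_value(30, -20, 80) == 0.5
# Readings outside the range are clamped to [0, 1].
assert normalize_value(100, -20, 80) == 1
assert normalize_value(-50, -20, 80) == 0

# With factor 0.5 the smoothed volume moves halfway toward the new value.
assert exp_smoothing(0.75, 0.25, 0.5) == 0.5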
26 changes: 6 additions & 20 deletions src/pipecat/vad/vad_analyzer.py
@@ -4,15 +4,12 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

import array
import math

from abc import abstractmethod
from enum import Enum

from pydantic.main import BaseModel

from pipecat.utils.utils import exp_smoothing
from pipecat.utils.audio import calculate_audio_volume


class VADState(Enum):
@@ -26,13 +23,14 @@ class VADParams(BaseModel):
confidence: float = 0.6
start_secs: float = 0.2
stop_secs: float = 0.8
min_rms: int = 1000
min_volume: float = 0.6


class VADAnalyzer:

def __init__(self, sample_rate: int, num_channels: int, params: VADParams):
self._sample_rate = sample_rate
self._num_channels = num_channels
self._params = params
self._vad_frames = self.num_frames_required()
self._vad_frames_num_bytes = self._vad_frames * num_channels * 2
@@ -47,10 +45,6 @@ def __init__(self, sample_rate: int, num_channels: int, params: VADParams):

self._vad_buffer = b""

# Volume exponential smoothing
self._smoothing_factor = 0.5
self._prev_rms = 1 - self._smoothing_factor

@property
def sample_rate(self):
return self._sample_rate
@@ -63,14 +57,6 @@ def num_frames_required(self) -> int:
def voice_confidence(self, buffer) -> float:
pass

def _get_smoothed_volume(self, audio: bytes, prev_rms: float, factor: float) -> float:
# https://docs.python.org/3/library/array.html
audio_array = array.array('h', audio)
squares = [sample**2 for sample in audio_array]
mean = sum(squares) / len(audio_array)
rms = math.sqrt(mean)
return exp_smoothing(rms, prev_rms, factor)

def analyze_audio(self, buffer) -> VADState:
self._vad_buffer += buffer

@@ -82,10 +68,10 @@ def analyze_audio(self, buffer) -> VADState:
self._vad_buffer = self._vad_buffer[num_required_bytes:]

confidence = self.voice_confidence(audio_frames)
rms = self._get_smoothed_volume(audio_frames, self._prev_rms, self._smoothing_factor)
self._prev_rms = rms

speaking = confidence >= self._params.confidence and rms >= self._params.min_rms
volume = calculate_audio_volume(audio_frames, self._sample_rate)

speaking = confidence >= self._params.confidence and volume >= self._params.min_volume

if speaking:
match self._vad_state:
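For reference, the VAD threshold now lives on VADParams as a normalized volume instead of a raw RMS value. A minimal configuration sketch follows, assuming the import path matches the file location shown above (src/pipecat/vad/vad_analyzer.py).

from pipecat.vad.vad_analyzer import VADParams

# All values are the defaults from this diff; min_volume is the new
# normalized-loudness threshold in [0, 1] that replaces min_rms.
params = VADParams(
    confidence=0.6,
    start_secs=0.2,
    stop_secs=0.8,
    min_volume=0.6,
)

# A concrete VADAnalyzer then reports speech only when both gates pass:
#   speaking = confidence >= params.confidence and volume >= params.min_volume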
