diff --git a/CHANGELOG.md b/CHANGELOG.md index 62d4bfc81..650639679 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to **pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + +- Fixed an issue in `AzureTTSService` that was causing interruptions to take + longer than expected. Azure Speech Synthesizer returns big chunks of audio and, + since we currently don't have a way to stop a long `AudioRawFrame`, we now + split the audio into small 20ms chunks. + ## [0.0.29] - 2024-06-07 ### Added diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index f4184f647..b441fe136 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -67,7 +67,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: if result.reason == ResultReason.SynthesizingAudioCompleted: await self.stop_ttfb_metrics() # Azure always sends a 44-byte header. Strip it off. - yield AudioRawFrame(audio=result.audio_data[44:], sample_rate=16000, num_channels=1) + audio = result.audio_data[44:] + # Chunk it to 20ms so interruptions work properly + bytes_frames = 640 + while len(audio) > 0: + chunk = audio[:bytes_frames] + yield AudioRawFrame(audio=chunk, sample_rate=16000, num_channels=1) + audio = audio[bytes_frames:] elif result.reason == ResultReason.Canceled: cancellation_details = result.cancellation_details logger.warning(f"Speech synthesis canceled: {cancellation_details.reason}")