From fa0deededa5c27cfbc190b207009626c3c00aabc Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Mon, 9 Sep 2024 10:53:23 +0900 Subject: [PATCH 01/60] Add voice options and make to use InputParams for Cartesia. --- .../07d-interruptible-cartesia.py | 4 +- .../12c-describe-video-anthropic.py | 4 +- examples/studypal/studypal.py | 4 +- src/pipecat/services/cartesia.py | 45 +++++++++++++------ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py index 6b8bbcc5f..7bcc7476b 100644 --- a/examples/foundational/07d-interruptible-cartesia.py +++ b/examples/foundational/07d-interruptible-cartesia.py @@ -52,7 +52,9 @@ async def main(): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man - sample_rate=44100, + params=CartesiaTTSService.InputParams( + sample_rate=44100, + ), ) llm = OpenAILLMService( diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py index cc1f14c92..8531debf8 100644 --- a/examples/foundational/12c-describe-video-anthropic.py +++ b/examples/foundational/12c-describe-video-anthropic.py @@ -78,7 +78,9 @@ async def main(): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady - sample_rate=16000, + params=CartesiaTTSService.InputParams( + sample_rate=16000, + ), ) @transport.event_handler("on_first_participant_joined") diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py index 8adfe2954..f14bd3def 100644 --- a/examples/studypal/studypal.py +++ b/examples/studypal/studypal.py @@ -124,7 +124,9 @@ async def main(): api_key=os.getenv("CARTESIA_API_KEY"), voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"), # British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9 - sample_rate=44100, + params=CartesiaTTSService.InputParams( + sample_rate=44100, + ), ) llm = OpenAILLMService( diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index e3541ccea..927da53f0 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -10,7 +10,8 @@ import asyncio import time -from typing import AsyncGenerator, Mapping +from typing import AsyncGenerator, Optional +from pydantic.main import BaseModel from pipecat.frames.frames import ( CancelFrame, @@ -61,6 +62,14 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(TTSService): + class InputParams(BaseModel): + model_id: Optional[str] = "sonic-english" + encoding: Optional[str] = "pcm_s16le" + sample_rate: Optional[int] = 16000 + container: Optional[str] = "raw" + language: Optional[str] = "en" + speed: Optional[str] = None + emotion: Optional[list[str]] = [] def __init__( self, @@ -69,10 +78,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", + params: InputParams = InputParams(), **kwargs): super().__init__(**kwargs) @@ -92,13 +98,15 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = model_id + self._model_id = params.model_id self._output_format = { - "container": "raw", - "encoding": encoding, - "sample_rate": sample_rate, + 
"container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, } - self._language = language + self._language = params.language + self._speed = params.speed + self._emotion = params.emotion self._websocket = None self._context_id = None @@ -249,15 +257,24 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() self._context_id = str(uuid.uuid4()) + voice_config = { + "mode": "id", + "id": self._voice_id + } + + if self._speed or self._emotion: + voice_config["__experimental_controls"] = {} + if self._speed: + voice_config["__experimental_controls"]["speed"] = self._speed + if self._emotion: + voice_config["__experimental_controls"]["emotion"] = self._emotion + msg = { "transcript": text + " ", "continue": True, "context_id": self._context_id, "model_id": self._model_id, - "voice": { - "mode": "id", - "id": self._voice_id - }, + "voice": voice_config, "output_format": self._output_format, "language": self._language, "add_timestamps": True, From 2da0ecbe3c7326d180125d16f32ee8f95113f847 Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Wed, 18 Sep 2024 00:38:12 +0900 Subject: [PATCH 02/60] Revert "model_id" as a main argument --- src/pipecat/services/cartesia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 25f54bf11..45e42470e 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -63,7 +63,6 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(AsyncWordTTSService): class InputParams(BaseModel): - model_id: Optional[str] = "sonic-english" encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" @@ -78,6 +77,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", params: InputParams = InputParams(), **kwargs): # Aggregating sentences still gives cleaner-sounding results and fewer @@ -96,7 +96,7 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = params.model_id + self._model_id = model_id self._output_format = { "container": params.container, "encoding": params.encoding, From 75008d8f115ccab562a9b564e7415bdcb024a850 Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Wed, 18 Sep 2024 00:51:45 +0900 Subject: [PATCH 03/60] Add speed and emotion setting method to Cartesia TTS service --- src/pipecat/services/cartesia.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 45e42470e..6cec9c06d 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -121,6 +121,14 @@ async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice_id = voice + async def set_speed(self, speed: str): + logger.debug(f"Switching TTS speed to: [{speed}]") + self._speed = speed + + async def set_emotion(self, emotion: list[str]): + logger.debug(f"Switching TTS emotion to: [{emotion}]") + self._emotion = emotion + async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) From 4533ed014fc4a22d3f7123c8f2ab3425ff7afb2f Mon Sep 17 00:00:00 2001 From: duyalei <> Date: Mon, 23 Sep 2024 16:34:31 +0800 Subject: [PATCH 04/60] 
add full-width punctuations as end of the sentence --- src/pipecat/utils/string.py | 3 ++- tests/test_ai_services.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py index a47db6c5c..cf9a22ad8 100644 --- a/src/pipecat/utils/string.py +++ b/src/pipecat/utils/string.py @@ -14,7 +14,8 @@ (? Date: Mon, 23 Sep 2024 08:37:29 -0400 Subject: [PATCH 05/60] Add language_code support for ElevenLabs TTS --- src/pipecat/services/elevenlabs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 00a32cbfd..79d90bc58 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -72,6 +72,7 @@ def calculate_word_times( class ElevenLabsTTSService(AsyncWordTTSService): class InputParams(BaseModel): + language_code: Optional[str] = None output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" optimize_streaming_latency: Optional[str] = None stability: Optional[float] = None @@ -228,6 +229,15 @@ async def _connect(self): if self._params.optimize_streaming_latency: url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}" + # language_code can only be used with the 'eleven_turbo_v2_5' model + if self._params.language_code: + if model == "eleven_turbo_v2_5": + url += f"&language_code={self._params.language_code}" + else: + logger.debug( + f"Language code [{self._params.language_code}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model." + ) + self._websocket = await websockets.connect(url) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) self._keepalive_task = self.get_event_loop().create_task(self._keepalive_task_handler()) From 8edee8155dbf7d7a47ee491a3d1caafdcd3910d5 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 23 Sep 2024 10:07:05 -0400 Subject: [PATCH 06/60] Add input params to Azure TTS --- src/pipecat/services/azure.py | 115 ++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 39 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 24e73cd2a..efb6a6fe4 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,45 +4,34 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import io +from typing import AsyncGenerator, Optional +import aiohttp +from loguru import logger from PIL import Image -from typing import AsyncGenerator - -from pipecat.frames.frames import ( - CancelFrame, - EndFrame, - ErrorFrame, - Frame, - StartFrame, - TTSAudioRawFrame, - TTSStartedFrame, - TTSStoppedFrame, - TranscriptionFrame, - URLImageRawFrame, -) -from pipecat.metrics.metrics import TTSUsageMetricsData -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import STTService, TTSService, ImageGenService +from pydantic import BaseModel + +from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame, + StartFrame, TranscriptionFrame, + TTSAudioRawFrame, TTSStartedFrame, + TTSStoppedFrame, URLImageRawFrame) +from pipecat.services.ai_services import (ImageGenService, STTService, + TTSService) from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 -from loguru import logger - # See .env.example for Azure configuration needed try: - from openai import AsyncAzureOpenAI - from 
azure.cognitiveservices.speech import ( - SpeechConfig, - SpeechRecognizer, - SpeechSynthesizer, - ResultReason, - CancellationReason, - ) - from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream + from azure.cognitiveservices.speech import (CancellationReason, + ResultReason, SpeechConfig, + SpeechRecognizer, + SpeechSynthesizer) + from azure.cognitiveservices.speech.audio import (AudioStreamFormat, + PushAudioInputStream) from azure.cognitiveservices.speech.dialog import AudioConfig + from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -70,6 +59,17 @@ def create_client(self, api_key=None, base_url=None, **kwargs): class AzureTTSService(TTSService): + class InputParams(BaseModel): + emphasis: Optional[str] = None + language_code: Optional[str] = "en-US" + pitch: Optional[str] = None + rate: Optional[str] = "1.05" + role: Optional[str] = None + style: Optional[str] = None + style_degree: Optional[str] = None + volume: Optional[str] = None + + def __init__( self, *, @@ -77,6 +77,7 @@ def __init__( region: str, voice="en-US-SaraNeural", sample_rate: int = 16000, + params: InputParams = InputParams(), **kwargs, ): super().__init__(sample_rate=sample_rate, **kwargs) @@ -86,10 +87,55 @@ def __init__( self._voice = voice self._sample_rate = sample_rate + self._params = params def can_generate_metrics(self) -> bool: return True + def _construct_ssml(self, text: str) -> str: + ssml = ( + f"" + f"" + "" + ) + + if self._params.style: + ssml += f"" + + if self._params.emphasis: + ssml += f"" + + ssml += text + + if self._params.emphasis: + ssml += "" + + ssml += "" + + if self._params.style: + ssml += "" + + ssml += "" + + return ssml + async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice = voice @@ -99,16 +145,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() - ssml = ( - "" - f"" - "" - "" - "" - f"{text}" - " " - ) + ssml = self._construct_ssml(text) result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml)) From 49f212389305465791eab139083b2e3286a05dee Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Tue, 24 Sep 2024 07:59:26 +0900 Subject: [PATCH 07/60] Apply and Fix upstream changes for Cartesia --- src/pipecat/services/cartesia.py | 113 +++++++++++++++++++------------ 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 40475343c..f08c06dea 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -9,7 +9,7 @@ import base64 import asyncio -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, Optional, Union, List from pydantic.main import BaseModel from pipecat.frames.frames import ( @@ -67,8 +67,8 @@ class InputParams(BaseModel): sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" language: Optional[str] = "en" - speed: Optional[str] = None - emotion: Optional[list[str]] = [] + speed: Optional[Union[str, float]] = "" + emotion: Optional[List[str]] = [] def __init__( self, @@ -91,13 +91,14 @@ def __init__( # can use those to generate text frames ourselves aligned with the # playout timing of the audio! 
super().__init__( - aggregate_sentences=True, push_text_frames=False, sample_rate=sample_rate, **kwargs + aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs ) self._api_key = api_key self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id + self._model_id = model_id self.set_model_name(model_id) self._output_format = { "container": params.container, @@ -116,6 +117,7 @@ def can_generate_metrics(self) -> bool: return True async def set_model(self, model: str): + self._model_id = model await super().set_model(model) logger.debug(f"Switching TTS model to: [{model}]") @@ -135,6 +137,31 @@ async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) + def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True): + voice_config = { + "mode": "id", + "id": self._voice_id + } + + if self._speed or self._emotion: + voice_config["__experimental_controls"] = {} + if self._speed: + voice_config["__experimental_controls"]["speed"] = self._speed + if self._emotion: + voice_config["__experimental_controls"]["emotion"] = self._emotion + + msg = { + "transcript": text, + "continue": continue_transcript, + "context_id": self._context_id, + "model_id": self._model_name, + "voice": voice_config, + "output_format": self._output_format, + "language": self._language, + "add_timestamps": add_timestamps, + } + return json.dumps(msg) + async def start(self, frame: StartFrame): await super().start(frame) await self._connect() @@ -190,17 +217,8 @@ async def flush_audio(self): if not self._context_id or not self._websocket: return logger.trace("Flushing audio") - msg = { - "transcript": "", - "continue": False, - "context_id": self._context_id, - "model_id": self.model_name, - "voice": {"mode": "id", "id": self._voice_id}, - "output_format": self._output_format, - "language": self._language, - "add_timestamps": True, - } - await self._websocket.send(json.dumps(msg)) + msg = self._build_msg(text="", continue_transcript=False) + await self._websocket.send(msg) async def _receive_task_handler(self): try: @@ -255,30 +273,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() self._context_id = str(uuid.uuid4()) - voice_config = { - "mode": "id", - "id": self._voice_id - } + msg = self._build_msg(text=text) - if self._speed or self._emotion: - voice_config["__experimental_controls"] = {} - if self._speed: - voice_config["__experimental_controls"]["speed"] = self._speed - if self._emotion: - voice_config["__experimental_controls"]["emotion"] = self._emotion - - msg = { - "transcript": text + " ", - "continue": True, - "context_id": self._context_id, - "model_id": self._model_id, - "voice": voice_config, - "output_format": self._output_format, - "language": self._language, - "add_timestamps": True, - } try: - await self._get_websocket().send(json.dumps(msg)) + await self._get_websocket().send(msg) await self.start_tts_usage_metrics(text) except Exception as e: logger.error(f"{self} error sending message: {e}") @@ -292,6 +290,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: class CartesiaHttpTTSService(TTSService): + class InputParams(BaseModel): + encoding: Optional[str] = "pcm_s16le" + sample_rate: Optional[int] = 16000 + container: Optional[str] = "raw" + language: Optional[str] = "en" + speed: Optional[Union[str, float]] = "" + emotion: 
Optional[List[str]] = [] + def __init__( self, *, @@ -299,9 +305,7 @@ def __init__( voice_id: str, model_id: str = "sonic-english", base_url: str = "https://api.cartesia.ai", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", + params: InputParams = InputParams(), **kwargs, ): super().__init__(**kwargs) @@ -309,12 +313,15 @@ def __init__( self._api_key = api_key self._voice_id = voice_id self._model_id = model_id + self.set_model_name(model_id) self._output_format = { - "container": "raw", - "encoding": encoding, - "sample_rate": sample_rate, + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, } - self._language = language + self._language = params.language + self._speed = params.speed + self._emotion = params.emotion self._client = AsyncCartesia(api_key=api_key, base_url=base_url) @@ -324,11 +331,20 @@ def can_generate_metrics(self) -> bool: async def set_model(self, model: str): logger.debug(f"Switching TTS model to: [{model}]") self._model_id = model + await super().set_model(model) async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice_id = voice + async def set_speed(self, speed: str): + logger.debug(f"Switching TTS speed to: [{speed}]") + self._speed = speed + + async def set_emotion(self, emotion: list[str]): + logger.debug(f"Switching TTS emotion to: [{emotion}]") + self._emotion = emotion + async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) @@ -348,6 +364,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() try: + voice_controls = None + if self._speed or self._emotion: + voice_controls = {} + if self._speed: + voice_controls["speed"] = self._speed + if self._emotion: + voice_controls["emotion"] = self._emotion + output = await self._client.tts.sse( model_id=self._model_id, transcript=text, @@ -355,6 +379,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: output_format=self._output_format, language=self._language, stream=False, + _experimental_voice_controls=voice_controls ) await self.stop_ttfb_metrics() From 8ee9621d6614e8f5c5d9e0252411f5b9c3569149 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 23 Sep 2024 10:49:50 -0400 Subject: [PATCH 08/60] Add setter functions --- src/pipecat/services/azure.py | 89 +++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index efb6a6fe4..41fc7598b 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -13,23 +13,32 @@ from PIL import Image from pydantic import BaseModel -from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame, - StartFrame, TranscriptionFrame, - TTSAudioRawFrame, TTSStartedFrame, - TTSStoppedFrame, URLImageRawFrame) -from pipecat.services.ai_services import (ImageGenService, STTService, - TTSService) +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + StartFrame, + TranscriptionFrame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, + URLImageRawFrame, +) +from pipecat.services.ai_services import ImageGenService, STTService, TTSService from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 # See .env.example for Azure configuration needed try: - from 
azure.cognitiveservices.speech import (CancellationReason, - ResultReason, SpeechConfig, - SpeechRecognizer, - SpeechSynthesizer) - from azure.cognitiveservices.speech.audio import (AudioStreamFormat, - PushAudioInputStream) + from azure.cognitiveservices.speech import ( + CancellationReason, + ResultReason, + SpeechConfig, + SpeechRecognizer, + SpeechSynthesizer, + ) + from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream from azure.cognitiveservices.speech.dialog import AudioConfig from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: @@ -69,7 +78,6 @@ class InputParams(BaseModel): style_degree: Optional[str] = None volume: Optional[str] = None - def __init__( self, *, @@ -116,7 +124,7 @@ def _construct_ssml(self, text: str) -> str: prosody_attrs.append(f"pitch='{self._params.pitch}'") if self._params.volume: prosody_attrs.append(f"volume='{self._params.volume}'") - + ssml += f"" if self._params.emphasis: @@ -140,6 +148,59 @@ async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice = voice + async def set_emphasis(self, emphasis: str): + logger.debug(f"Setting TTS emphasis to: [{emphasis}]") + self._params.emphasis = emphasis + + async def set_language_code(self, language_code: str): + logger.debug(f"Setting TTS language code to: [{language_code}]") + self._params.language_code = language_code + + async def set_pitch(self, pitch: str): + logger.debug(f"Setting TTS pitch to: [{pitch}]") + self._params.pitch = pitch + + async def set_rate(self, rate: str): + logger.debug(f"Setting TTS rate to: [{rate}]") + self._params.rate = rate + + async def set_role(self, role: str): + logger.debug(f"Setting TTS role to: [{role}]") + self._params.role = role + + async def set_style(self, style: str): + logger.debug(f"Setting TTS style to: [{style}]") + self._params.style = style + + async def set_style_degree(self, style_degree: str): + logger.debug(f"Setting TTS style degree to: [{style_degree}]") + self._params.style_degree = style_degree + + async def set_volume(self, volume: str): + logger.debug(f"Setting TTS volume to: [{volume}]") + self._params.volume = volume + + async def set_params(self, **kwargs): + valid_params = { + "voice": self.set_voice, + "emphasis": self.set_emphasis, + "language_code": self.set_language_code, + "pitch": self.set_pitch, + "rate": self.set_rate, + "role": self.set_role, + "style": self.set_style, + "style_degree": self.set_style_degree, + "volume": self.set_volume, + } + + for param, value in kwargs.items(): + if param in valid_params: + await valid_params[param](value) + else: + logger.warning(f"Ignoring unknown parameter: {param}") + + logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}") + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") From b1e17ee34792326350c4040216bff69a50f3c4b8 Mon Sep 17 00:00:00 2001 From: mercuryyy Date: Tue, 24 Sep 2024 07:45:29 -0400 Subject: [PATCH 09/60] Fix syntax error in deepgram.py --- src/pipecat/services/deepgram.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index fab12e080..914bc2ec2 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -77,8 +77,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") base_url = self._base_url - request_url = 
f"{base_url}?model={self._voice}&encoding={ - self._encoding}&container=none&sample_rate={self._sample_rate}" + request_url = f"{base_url}?model={self._voice}&encoding={self._encoding}&container=none&sample_rate={self._sample_rate}" headers = {"authorization": f"token {self._api_key}"} body = {"text": text} From cb49b6a0d678662cdc2483b2497e00fbf0af28bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 23 Sep 2024 23:39:05 -0700 Subject: [PATCH 10/60] rtvi: add llm-text and tts-text server messages --- src/pipecat/processors/frameworks/rtvi.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 820ea716c..7a35c5c06 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -242,6 +242,22 @@ class RTVILLMFunctionCallResultData(BaseModel): result: dict | str +class RTVITextMessageData(BaseModel): + text: str + + +class RTVILLMTextMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["llm-text"] = "llm-text" + data: RTVITextMessageData + + +class RTVITTSTextMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["tts-text"] = "tts-text" + data: RTVITextMessageData + + class RTVITranscriptionMessageData(BaseModel): text: str user_id: str From 08ac311971afb82fff28efa58af81814d9c052c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 09:35:36 -0700 Subject: [PATCH 11/60] rtvi: use task to process incoming action frames --- src/pipecat/processors/frameworks/rtvi.py | 24 ++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 7a35c5c06..bb23c9856 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -316,6 +316,11 @@ def __init__( self._registered_actions: Dict[str, RTVIAction] = {} self._registered_services: Dict[str, RTVIService] = {} + # A task to process incoming action frames. + self._action_task = self.get_event_loop().create_task(self._action_task_handler()) + self._action_queue = asyncio.Queue() + + # A task to process incoming transport messages. self._message_task = self.get_event_loop().create_task(self._message_task_handler()) self._message_queue = asyncio.Queue() @@ -401,7 +406,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, TransportMessageFrame): await self._message_queue.put(frame) elif isinstance(frame, RTVIActionFrame): - await self._handle_action(frame.message_id, frame.rtvi_action_run) + await self._action_queue.put(frame) # Other frames else: await self.push_frame(frame, direction) @@ -415,12 +420,16 @@ async def _start(self, frame: StartFrame): await self._maybe_send_bot_ready() async def _stop(self, frame: EndFrame): - # We need to cancel the message task handler because that one is not - # processing EndFrames. 
+ self._action_task.cancel() + await self._action_task + self._message_task.cancel() await self._message_task async def _cancel(self, frame: CancelFrame): + self._action_task.cancel() + await self._action_task + self._message_task.cancel() await self._message_task @@ -471,6 +480,15 @@ async def _handle_bot_speaking(self, frame: Frame): if message: await self._push_transport_message(message) + async def _action_task_handler(self): + while True: + try: + frame = await self._action_queue.get() + await self._handle_action(frame.message_id, frame.rtvi_action_run) + self._action_queue.task_done() + except asyncio.CancelledError: + break + async def _message_task_handler(self): while True: try: From a483f1a083d7d97e932c680fa542c78926dba1a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 10:48:15 -0700 Subject: [PATCH 12/60] rtvi: handle all actions from the action task --- src/pipecat/processors/frameworks/rtvi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index bb23c9856..49095b2e4 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -521,7 +521,8 @@ async def _handle_message(self, frame: TransportMessageFrame): await self._handle_update_config(message.id, update_config) case "action": action = RTVIActionRun.model_validate(message.data) - await self._handle_action(message.id, action) + action_frame = RTVIActionFrame(message_id=message.id, rtvi_action_run=action) + await self._action_queue.put(action_frame) case "llm-function-call-result": data = RTVILLMFunctionCallResultData.model_validate(message.data) await self._handle_function_call_result(data) From 31b5667cee272651478aebc13e5ca4f5ab1bd2c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 13:10:40 -0700 Subject: [PATCH 13/60] frames: log text with [] so we can distinguish spaces better --- src/pipecat/frames/frames.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index e4495098b..768e53e39 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -132,9 +132,7 @@ class VisionImageRawFrame(InputImageRawFrame): def __str__(self): pts = format_pts(self.pts) - return ( - f"{self.name}(pts: {pts}, text: {self.text}, size: {self.size}, format: {self.format})" - ) + return f"{self.name}(pts: {pts}, text: [{self.text}], size: {self.size}, format: {self.format})" @dataclass @@ -177,7 +175,7 @@ class TextFrame(DataFrame): def __str__(self): pts = format_pts(self.pts) - return f"{self.name}(pts: {pts}, text: {self.text})" + return f"{self.name}(pts: {pts}, text: [{self.text}])" @dataclass @@ -192,7 +190,7 @@ class TranscriptionFrame(TextFrame): language: Language | None = None def __str__(self): - return f"{self.name}(user: {self.user_id}, text: {self.text}, language: {self.language}, timestamp: {self.timestamp})" + return f"{self.name}(user: {self.user_id}, text: [{self.text}], language: {self.language}, timestamp: {self.timestamp})" @dataclass @@ -205,7 +203,7 @@ class InterimTranscriptionFrame(TextFrame): language: Language | None = None def __str__(self): - return f"{self.name}(user: {self.user_id}, text: {self.text}, language: {self.language}, timestamp: {self.timestamp})" + return f"{self.name}(user: {self.user_id}, text: [{self.text}], language: {self.language}, timestamp: 
{self.timestamp})" @dataclass From ee3786fe155a426843707d465b302f3419d569f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:10:22 -0700 Subject: [PATCH 14/60] frames: add EndTaskFrame and CancelTaskFrame --- CHANGELOG.md | 4 ++++ src/pipecat/frames/frames.py | 21 +++++++++++++++++++++ src/pipecat/pipeline/task.py | 16 ++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1d571da5..fb08aab2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `EndTaskFrame` and `CancelTaskFrame`. These are new frames that are + meant to be pushed upstream to tell the pipeline task to stop nicely or + immediately respectively. + - Added configurable LLM parameters (e.g., temperature, top_p, max_tokens, seed) for OpenAI, Anthropic, and Together AI services along with corresponding setter functions. diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 768e53e39..273aad214 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -339,6 +339,27 @@ class FatalErrorFrame(ErrorFrame): fatal: bool = field(default=True, init=False) +@dataclass +class EndTaskFrame(SystemFrame): + """This is used to notify the pipeline task that the pipeline should be + closed nicely (flushing all the queued frames) by pushing an EndFrame + downstream. + + """ + + pass + + +@dataclass +class CancelTaskFrame(SystemFrame): + """This is used to notify the pipeline task that the pipeline should be + stopped immediately by pushing a CancelFrame downstream. + + """ + + pass + + @dataclass class StopTaskFrame(SystemFrame): """Indicates that a pipeline task should be stopped but that the pipeline diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index 2b46c47c2..f79ff6f39 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -14,7 +14,9 @@ from pipecat.clocks.system_clock import SystemClock from pipecat.frames.frames import ( CancelFrame, + CancelTaskFrame, EndFrame, + EndTaskFrame, ErrorFrame, Frame, MetricsFrame, @@ -52,7 +54,13 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) async def _handle_upstream_frame(self, frame: Frame): - if isinstance(frame, ErrorFrame): + if isinstance(frame, EndTaskFrame): + # Tell the task we should end nicely. + await self._up_queue.put(EndTaskFrame()) + elif isinstance(frame, CancelTaskFrame): + # Tell the task we should end right away. + await self._up_queue.put(CancelTaskFrame()) + elif isinstance(frame, ErrorFrame): logger.error(f"Error running app: {frame}") if frame.fatal: # Cancel all tasks downstream. 
@@ -165,7 +173,11 @@ async def _process_up_queue(self): while True: try: frame = await self._up_queue.get() - if isinstance(frame, StopTaskFrame): + if isinstance(frame, EndTaskFrame): + await self.queue_frame(EndFrame()) + elif isinstance(frame, CancelTaskFrame): + await self.queue_frame(CancelFrame()) + elif isinstance(frame, StopTaskFrame): await self.queue_frame(StopTaskFrame()) self._up_queue.task_done() except asyncio.CancelledError: From 1a3de0e8191a7529a474ec17f49b906d78c82623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:12:06 -0700 Subject: [PATCH 15/60] rtvi: add RTVIProcessor.handle_message() --- src/pipecat/processors/frameworks/rtvi.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 49095b2e4..b75b65627 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -343,6 +343,9 @@ async def set_client_ready(self): self._client_ready = True await self._maybe_send_bot_ready() + async def handle_message(self, message: RTVIMessage): + await self._message_queue.put(message) + async def handle_function_call( self, function_name: str, @@ -492,20 +495,21 @@ async def _action_task_handler(self): async def _message_task_handler(self): while True: try: - frame = await self._message_queue.get() - await self._handle_message(frame) + message = await self._message_queue.get() + await self._handle_message(message) self._message_queue.task_done() except asyncio.CancelledError: break - async def _handle_message(self, frame: TransportMessageFrame): + async def _handle_transport_message(self, frame: TransportMessageFrame): try: message = RTVIMessage.model_validate(frame.message) + await self._message_queue.put(message) except ValidationError as e: - await self.send_error(f"Invalid incoming message: {e}") - logger.warning(f"Invalid incoming message: {e}") - return + await self.send_error(f"Invalid RTVI transport message: {e}") + logger.warning(f"Invalid RTVI transport message: {e}") + async def _handle_message(self, message: RTVIMessage): try: match message.type: case "client-ready": @@ -531,8 +535,8 @@ async def _handle_message(self, frame: TransportMessageFrame): await self._send_error_response(message.id, f"Unsupported type {message.type}") except ValidationError as e: - await self._send_error_response(message.id, f"Invalid incoming message: {e}") - logger.warning(f"Invalid incoming message: {e}") + await self._send_error_response(message.id, f"Invalid message: {e}") + logger.warning(f"Invalid message: {e}") except Exception as e: await self._send_error_response(message.id, f"Exception processing message: {e}") logger.warning(f"Exception processing message: {e}") From e276dcbab78907dac1df1e5ab1176da74255b4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:19:00 -0700 Subject: [PATCH 16/60] initialize task variables and add minor description --- src/pipecat/processors/frame_processor.py | 1 - src/pipecat/processors/frameworks/rtvi.py | 26 ++++++---- src/pipecat/services/ai_services.py | 1 + src/pipecat/transports/base_input.py | 13 +++-- src/pipecat/transports/base_output.py | 59 +++++++++++++++++------ src/pipecat/transports/services/daily.py | 5 ++ 6 files changed, 77 insertions(+), 28 deletions(-) diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index 
1bf42311d..b56846aa6 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -5,7 +5,6 @@ # import asyncio -import time from enum import Enum diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 49095b2e4..994be3fad 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -420,18 +420,26 @@ async def _start(self, frame: StartFrame): await self._maybe_send_bot_ready() async def _stop(self, frame: EndFrame): - self._action_task.cancel() - await self._action_task + if self._action_task: + self._action_task.cancel() + await self._action_task + self._action_task = None - self._message_task.cancel() - await self._message_task + if self._message_task: + self._message_task.cancel() + await self._message_task + self._message_task = None async def _cancel(self, frame: CancelFrame): - self._action_task.cancel() - await self._action_task - - self._message_task.cancel() - await self._message_task + if self._action_task: + self._action_task.cancel() + await self._action_task + self._action_task = None + + if self._message_task: + self._message_task.cancel() + await self._message_task + self._message_task = None async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True): frame = TransportMessageFrame( diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index cdad3de52..197067adc 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -350,6 +350,7 @@ async def _stop_words_task(self): if self._words_task: self._words_task.cancel() await self._words_task + self._words_task = None async def _words_task_handler(self): while True: diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index 73ad3f5e3..df7babff1 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -37,6 +37,10 @@ def __init__(self, params: TransportParams, **kwargs): self._executor = ThreadPoolExecutor(max_workers=5) + # Task to process incoming audio (VAD) and push audio frames downstream + # if passthrough is enabled. + self._audio_task = None + async def start(self, frame: StartFrame): # Create audio input queue and task if needed. if self._params.audio_in_enabled or self._params.vad_enabled: @@ -45,16 +49,17 @@ async def start(self, frame: StartFrame): async def stop(self, frame: EndFrame): # Cancel and wait for the audio input task to finish. - if self._params.audio_in_enabled or self._params.vad_enabled: + if self._audio_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_task.cancel() await self._audio_task + self._audio_task = None async def cancel(self, frame: CancelFrame): - # Cancel all the tasks and wait for them to finish. - - if self._params.audio_in_enabled or self._params.vad_enabled: + # Cancel and wait for the audio input task to finish. 
+ if self._audio_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_task.cancel() await self._audio_task + self._audio_task = None def vad_analyzer(self) -> VADAnalyzer | None: return self._params.vad_analyzer diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 5423b122f..941a3505a 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -47,6 +47,18 @@ def __init__(self, params: TransportParams, **kwargs): self._params = params + # Task to process incoming frames so we don't block upstream elements. + self._sink_task = None + + # Task to process incoming frames using a clock. + self._sink_clock_task = None + + # Task to write/send audio frames. + self._audio_out_task = None + + # Task to write/send image frames. + self._camera_out_task = None + # These are the images that we should send to the camera at our desired # framerate. self._camera_images = None @@ -88,36 +100,53 @@ async def stop(self, frame: EndFrame): # that EndFrame to be processed by the sink tasks. We also need to wait # for these tasks before cancelling the camera and audio tasks below # because they might be still rendering. - await self._sink_task - await self._sink_clock_task + if self._sink_task: + await self._sink_task + if self._sink_clock_task: + await self._sink_clock_task # Cancel and wait for the camera output task to finish. - if self._params.camera_out_enabled: + if self._camera_out_task and self._params.camera_out_enabled: self._camera_out_task.cancel() await self._camera_out_task + self._camera_out_task = None # Cancel and wait for the audio output task to finish. - if self._params.audio_out_enabled and self._params.audio_out_is_live: + if ( + self._audio_out_task + and self._params.audio_out_enabled + and self._params.audio_out_is_live + ): self._audio_out_task.cancel() await self._audio_out_task + self._audio_out_task = None async def cancel(self, frame: CancelFrame): # Since we are cancelling everything it doesn't matter if we cancel sink # tasks first or not. - self._sink_task.cancel() - self._sink_clock_task.cancel() - await self._sink_task - await self._sink_clock_task + if self._sink_task: + self._sink_task.cancel() + await self._sink_task + self._sink_task = None + + if self._sink_clock_task: + self._sink_clock_task.cancel() + await self._sink_clock_task + self._sink_clock_task = None # Cancel and wait for the camera output task to finish. - if self._params.camera_out_enabled: + if self._camera_out_task and self._params.camera_out_enabled: self._camera_out_task.cancel() await self._camera_out_task + self._camera_out_task = None # Cancel and wait for the audio output task to finish. - if self._params.audio_out_enabled and self._params.audio_out_is_live: + if self._audio_out_task and ( + self._params.audio_out_enabled and self._params.audio_out_is_live + ): self._audio_out_task.cancel() await self._audio_out_task + self._audio_out_task = None async def send_message(self, frame: TransportMessageFrame): pass @@ -183,11 +212,13 @@ async def _handle_interruptions(self, frame: Frame): if isinstance(frame, StartInterruptionFrame): # Stop sink tasks. - self._sink_task.cancel() - await self._sink_task + if self._sink_task: + self._sink_task.cancel() + await self._sink_task # Stop sink clock tasks. - self._sink_clock_task.cancel() - await self._sink_clock_task + if self._sink_clock_task: + self._sink_clock_task.cancel() + await self._sink_clock_task # Create sink tasks. 
self._create_sink_tasks() # Let's send a bot stopped speaking if we have to. diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 48b59d8ff..50c2ae085 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -575,6 +575,9 @@ def __init__(self, client: DailyTransportClient, params: DailyParams, **kwargs): self._client = client self._video_renderers = {} + + # Task that gets audio data from a device or the network and queues it + # internally to be processed. self._audio_in_task = None self._vad_analyzer: VADAnalyzer | None = params.vad_analyzer @@ -603,6 +606,7 @@ async def stop(self, frame: EndFrame): if self._audio_in_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_in_task.cancel() await self._audio_in_task + self._audio_in_task = None async def cancel(self, frame: CancelFrame): # Parent stop. @@ -613,6 +617,7 @@ async def cancel(self, frame: CancelFrame): if self._audio_in_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_in_task.cancel() await self._audio_in_task + self._audio_in_task = None async def cleanup(self): await super().cleanup() From 9461bacf0d6793661b20d7cc5367ad083188a433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:24:37 -0700 Subject: [PATCH 17/60] pyproject: update fastapi to 0.115.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index aebccda2f..46345ed71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ openpipe = [ "openpipe~=4.24.0" ] playht = [ "pyht~=0.0.28" ] silero = [ "onnxruntime>=1.16.1" ] together = [ "together~=1.2.7" ] -websocket = [ "websockets~=12.0", "fastapi~=0.112.1" ] +websocket = [ "websockets~=12.0", "fastapi~=0.115.0" ] whisper = [ "faster-whisper~=1.0.3" ] xtts = [ "resampy~=0.4.3" ] From b8713666c23d19989abdfd6512838e0cefd2df0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:35:05 -0700 Subject: [PATCH 18/60] processors: add AsyncGeneratorProcessor --- CHANGELOG.md | 4 +++ src/pipecat/processors/async_generator.py | 42 +++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 src/pipecat/processors/async_generator.py diff --git a/CHANGELOG.md b/CHANGELOG.md index fb08aab2e..6c4bf3c92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `AsyncGeneratorProcessor`. This processor can be used together with a + `FrameSerializer` as an async generator. It provides a `generator()` function + that returns an `AsyncGenerator` and that yields serialized frames. + - Added `EndTaskFrame` and `CancelTaskFrame`. These are new frames that are meant to be pushed upstream to tell the pipeline task to stop nicely or immediately respectively. 
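For context, the `AsyncGeneratorProcessor` introduced below is meant to be consumed roughly as follows. This is a minimal usage sketch, not part of the patch: the `ProtobufFrameSerializer` choice and the surrounding streaming coroutine are illustrative assumptions, and the pipeline wiring is elided.

```
# Hypothetical usage sketch (not part of this patch): consume the serialized
# frames produced by an AsyncGeneratorProcessor, e.g. to stream them out over
# a network response. ProtobufFrameSerializer is one FrameSerializer option;
# any serializer whose serialize() returns data would work the same way.
from pipecat.processors.async_generator import AsyncGeneratorProcessor
from pipecat.serializers.protobuf import ProtobufFrameSerializer

processor = AsyncGeneratorProcessor(serializer=ProtobufFrameSerializer())
# ... add `processor` to a Pipeline so frames flow through it ...

async def stream_frames():
    # generator() yields each serialized frame and finishes after the
    # processor sees an EndFrame or CancelFrame.
    async for data in processor.generator():
        yield data
```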
diff --git a/src/pipecat/processors/async_generator.py b/src/pipecat/processors/async_generator.py new file mode 100644 index 000000000..66b2a3e99 --- /dev/null +++ b/src/pipecat/processors/async_generator.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from typing import Any, AsyncGenerator + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, +) +from pipecat.processors.frame_processor import FrameProcessor, FrameDirection +from pipecat.serializers.base_serializer import FrameSerializer + + +class AsyncGeneratorProcessor(FrameProcessor): + def __init__(self, *, serializer: FrameSerializer, **kwargs): + super().__init__(**kwargs) + self._serializer = serializer + self._data_queue = asyncio.Queue() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, (CancelFrame, EndFrame)): + await self._data_queue.put(None) + else: + data = self._serializer.serialize(frame) + if data: + await self._data_queue.put(data) + + async def generator(self) -> AsyncGenerator[Any, None]: + running = True + while running: + data = await self._data_queue.get() + running = data is not None + if data: + yield data From 3621fceae2555ca24fac993086b19241740f780a Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Wed, 25 Sep 2024 09:19:28 -0700 Subject: [PATCH 19/60] fixes as noted by aleix --- src/pipecat/services/ai_services.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 197067adc..b32eb708d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -259,6 +259,10 @@ def __init__( async def flush_audio(self): pass + async def say(self, text: str): + await super.say(text) + await self.flush_audio() + async def start(self, frame: StartFrame): await super().start(frame) if self._push_stop_frames: @@ -278,6 +282,11 @@ async def cancel(self, frame: CancelFrame): await self._stop_frame_task self._stop_frame_task = None + async def process_frame(self, frame: Frame, direction: FrameDirection): + super().process_frame(frame, direction) + if isinstance(frame, TTSSpeakFrame): + await self.flush_audio() + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): await super().push_frame(frame, direction) From 3d43ad0f4dcb9570202516dd0095a26226853aca Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Wed, 25 Sep 2024 10:59:00 -0700 Subject: [PATCH 20/60] actually save the file --- src/pipecat/services/ai_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index b32eb708d..21ef8dfca 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -283,7 +283,7 @@ async def cancel(self, frame: CancelFrame): self._stop_frame_task = None async def process_frame(self, frame: Frame, direction: FrameDirection): - super().process_frame(frame, direction) + await super().process_frame(frame, direction) if isinstance(frame, TTSSpeakFrame): await self.flush_audio() From c4e94e280eca9e6a2fcbfacae2bcba8b3448f57d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 25 Sep 2024 16:35:33 -0700 Subject: [PATCH 21/60] processors: add support for event handlers --- CHANGELOG.md | 10 ++++++++ 
src/pipecat/processors/frame_processor.py | 30 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c4bf3c92..f35978dcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- All `FrameProcessors` can now register event handlers. + +``` +tts = SomeTTSService(...) + +@tts.event_handler("on_connected"): +async def on_connected(processor): + ... +``` + - Added `AsyncGeneratorProcessor`. This processor can be used together with a `FrameSerializer` as an async generator. It provides a `generator()` function that returns an `AsyncGenerator` and that yields serialized frames. diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index b56846aa6..f71e066d7 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -5,6 +5,7 @@ # import asyncio +import inspect from enum import Enum @@ -48,6 +49,8 @@ def __init__( self._loop: asyncio.AbstractEventLoop = loop or asyncio.get_running_loop() self._sync = sync + self._event_handlers: dict = {} + # Clock self._clock: BaseClock | None = None @@ -169,6 +172,23 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect else: await self.__push_queue.put((frame, direction)) + def event_handler(self, event_name: str): + def decorator(handler): + self.add_event_handler(event_name, handler) + return handler + + return decorator + + def add_event_handler(self, event_name: str, handler): + if event_name not in self._event_handlers: + raise Exception(f"Event handler {event_name} not registered") + self._event_handlers[event_name].append(handler) + + def _register_event_handler(self, event_name: str): + if event_name in self._event_handlers: + raise Exception(f"Event handler {event_name} already registered") + self._event_handlers[event_name] = [] + # # Handle interruptions # @@ -212,5 +232,15 @@ async def __push_frame_task_handler(self): except asyncio.CancelledError: break + async def _call_event_handler(self, event_name: str, *args, **kwargs): + try: + for handler in self._event_handlers[event_name]: + if inspect.iscoroutinefunction(handler): + await handler(self, *args, **kwargs) + else: + handler(self, *args, **kwargs) + except Exception as e: + logger.exception(f"Exception in event handler {event_name}: {e}") + def __str__(self): return self.name From f06aa300d01b202f018265c3e880f147566b302c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 25 Sep 2024 16:35:49 -0700 Subject: [PATCH 22/60] rtvi: add on_bot_ready event --- src/pipecat/processors/frameworks/rtvi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 1458cc21d..930b2331d 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -324,6 +324,8 @@ def __init__( self._message_task = self.get_event_loop().create_task(self._message_task_handler()) self._message_queue = asyncio.Queue() + self._register_event_handler("on_bot_ready") + def register_action(self, action: RTVIAction): id = self._action_id(action.service, action.action) self._registered_actions[id] = action @@ -624,8 +626,9 @@ async def _handle_action(self, request_id: str | None, data: RTVIActionRun): async def _maybe_send_bot_ready(self): if self._pipeline_started and 
self._client_ready: - await self._send_bot_ready() await self._update_config(self._config, False) + await self._send_bot_ready() + await self._call_event_handler("on_bot_ready") async def _send_bot_ready(self): if not self._params.send_bot_ready: From 73da8c1910fc7ae37f3f1ac6bf8c50d89aabd927 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 25 Sep 2024 22:40:36 -0400 Subject: [PATCH 23/60] Improve usability of Deepgram TTS: use Deepgram client, remove aiohttp --- .../07c-interruptible-deepgram.py | 16 ++-- src/pipecat/services/deepgram.py | 85 +++++++++---------- 2 files changed, 47 insertions(+), 54 deletions(-) diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index 41bef8a47..fc33c246f 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -5,10 +5,14 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -22,12 +26,6 @@ from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -52,9 +50,7 @@ async def main(): stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - tts = DeepgramTTSService( - aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en" - ) + tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 914bc2ec2..6929e66e5 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -4,10 +4,11 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp - +import asyncio from typing import AsyncGenerator +from loguru import logger + from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -15,27 +16,25 @@ Frame, InterimTranscriptionFrame, StartFrame, + TranscriptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - TranscriptionFrame, ) from pipecat.services.ai_services import STTService, TTSService from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 -from loguru import logger - - # See .env.example for Deepgram configuration needed try: from deepgram import ( AsyncListenWebSocketClient, DeepgramClient, DeepgramClientOptions, - LiveTranscriptionEvents, LiveOptions, LiveResultResponse, + LiveTranscriptionEvents, + SpeakOptions, ) except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -50,9 +49,7 @@ def __init__( self, *, api_key: str, - aiohttp_session: aiohttp.ClientSession, voice: str = "aura-helios-en", - base_url: str = "https://api.deepgram.com/v1/speak", sample_rate: int = 16000, encoding: str = "linear16", **kwargs, @@ -60,11 +57,9 @@ def __init__( super().__init__(**kwargs) self._voice = voice - self._api_key = api_key - self._base_url = base_url self._sample_rate = sample_rate self._encoding = encoding - self._aiohttp_session = aiohttp_session + self._deepgram_client = DeepgramClient(api_key=api_key) def 
can_generate_metrics(self) -> bool: return True @@ -76,43 +71,45 @@ async def set_voice(self, voice: str): async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") - base_url = self._base_url - request_url = f"{base_url}?model={self._voice}&encoding={self._encoding}&container=none&sample_rate={self._sample_rate}" - headers = {"authorization": f"token {self._api_key}"} - body = {"text": text} + options = SpeakOptions( + model=self._voice, + encoding=self._encoding, + sample_rate=self._sample_rate, + container="none", + ) try: await self.start_ttfb_metrics() - async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r: - if r.status != 200: - response_text = await r.text() - # If we get a a "Bad Request: Input is unutterable", just print out a debug log. - # All other unsuccesful requests should emit an error frame. If not specifically - # handled by the running PipelineTask, the ErrorFrame will cancel the task. - if "unutterable" in response_text: - logger.debug(f"Unutterable text: [{text}]") - return - - logger.error( - f"{self} error getting audio (status: {r.status}, error: {response_text})" - ) - yield ErrorFrame( - f"Error getting audio (status: {r.status}, error: {response_text})" - ) - return - - await self.start_tts_usage_metrics(text) - - await self.push_frame(TTSStartedFrame()) - async for data in r.content: - await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame( - audio=data, sample_rate=self._sample_rate, num_channels=1 - ) - yield frame - await self.push_frame(TTSStoppedFrame()) + + response = await asyncio.to_thread( + self._deepgram_client.speak.v("1").stream, {"text": text}, options + ) + + await self.start_tts_usage_metrics(text) + await self.push_frame(TTSStartedFrame()) + + # The response.stream_memory is already a BytesIO object + audio_buffer = response.stream_memory + + if audio_buffer is None: + raise ValueError("No audio data received from Deepgram") + + # Read and yield the audio data in chunks + audio_buffer.seek(0) # Ensure we're at the start of the buffer + chunk_size = 8192 # Use a fixed buffer size + while True: + await self.stop_ttfb_metrics() + chunk = audio_buffer.read(chunk_size) + if not chunk: + break + frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1) + yield frame + + await self.push_frame(TTSStoppedFrame()) + except Exception as e: logger.exception(f"{self} exception: {e}") + yield ErrorFrame(f"Error getting audio: {str(e)}") class DeepgramSTTService(STTService): From d05717a1bd709ecbf475bb2ff410763bcd8783cd Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Thu, 26 Sep 2024 19:52:25 +0900 Subject: [PATCH 24/60] Apply Ruff formater --- src/pipecat/services/cartesia.py | 41 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index f08c06dea..c1c296046 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -71,15 +71,16 @@ class InputParams(BaseModel): emotion: Optional[List[str]] = [] def __init__( - self, - *, - api_key: str, - voice_id: str, - cartesia_version: str = "2024-06-10", - url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - params: InputParams = InputParams(), - **kwargs): + self, + *, + api_key: str, + voice_id: str, + cartesia_version: str = "2024-06-10", + url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", + params: 
InputParams = InputParams(), + **kwargs, + ): # Aggregating sentences still gives cleaner-sounding results and fewer # artifacts than streaming one word at a time. On average, waiting for a # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama @@ -91,7 +92,10 @@ def __init__( # can use those to generate text frames ourselves aligned with the # playout timing of the audio! super().__init__( - aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs + aggregate_sentences=True, + push_text_frames=False, + sample_rate=params.sample_rate, + **kwargs, ) self._api_key = api_key @@ -137,11 +141,10 @@ async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) - def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True): - voice_config = { - "mode": "id", - "id": self._voice_id - } + def _build_msg( + self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True + ): + voice_config = {"mode": "id", "id": self._voice_id} if self._speed or self._emotion: voice_config["__experimental_controls"] = {} @@ -236,8 +239,7 @@ async def _receive_task_handler(self): await self.add_word_timestamps([("LLMFullResponseEndFrame", 0)]) elif msg["type"] == "timestamps": await self.add_word_timestamps( - list(zip(msg["word_timestamps"]["words"], - msg["word_timestamps"]["start"])) + list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"])) ) elif msg["type"] == "chunk": await self.stop_ttfb_metrics() @@ -254,8 +256,7 @@ async def _receive_task_handler(self): await self.stop_all_metrics() await self.push_error(ErrorFrame(f'{self} error: {msg["error"]}')) else: - logger.error( - f"Cartesia error, unknown message type: {msg}") + logger.error(f"Cartesia error, unknown message type: {msg}") except asyncio.CancelledError: pass except Exception as e: @@ -379,7 +380,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: output_format=self._output_format, language=self._language, stream=False, - _experimental_voice_controls=voice_controls + _experimental_voice_controls=voice_controls, ) await self.stop_ttfb_metrics() From f5e0b946c74695549f26135c8b92546fff22270d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 09:08:37 -0700 Subject: [PATCH 25/60] services(cartesia): fix string formatting --- src/pipecat/services/cartesia.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index c1c296046..e38d56db3 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -180,8 +180,7 @@ async def cancel(self, frame: CancelFrame): async def _connect(self): try: self._websocket = await websockets.connect( - f"{self._url}?api_key={self._api_key}&cartesia_version={ - self._cartesia_version}" + f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}" ) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) except Exception as e: From c7c709a0a79a27c542761c23f06606ec6d3dc0dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 10:31:53 -0700 Subject: [PATCH 26/60] github: cache venv when running tests --- .github/workflows/tests.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/tests.yaml 
b/.github/workflows/tests.yaml index ce3e13494..b806efad4 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -27,6 +27,13 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.10" + - name: Cache virtual environment + uses: actions/cache@v3 + with: + # We are hashing dev-requirements.txt and test-requirements.txt which + # contain all dependencies needed to run the tests. + key: venv-${{ runner.os }}-${{ steps.setup_python.outputs.python-version}}-${{ hashFiles('dev-requirements.txt') }}-${{ hashFiles('test-requirements.txt') }} + path: .venv - name: Install system packages id: install_system_packages run: | From 6a6ea251aec1f32425d9a5fb0031119feaaceed5 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 26 Sep 2024 09:31:09 -0400 Subject: [PATCH 27/60] Add AWS Polly TTS support --- README.md | 4 +- dot-env.template | 5 + .../foundational/07m-interruptible-aws.py | 98 ++++++++++++ pyproject.toml | 1 + src/pipecat/services/aws.py | 146 ++++++++++++++++++ test-requirements.txt | 1 + 6 files changed, 252 insertions(+), 3 deletions(-) create mode 100644 examples/foundational/07m-interruptible-aws.py create mode 100644 src/pipecat/services/aws.py diff --git a/README.md b/README.md index faf0137dc..793d1f630 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ pip install "pipecat-ai[option,...]" Your project may or may not need these, so they're made available as optional requirements. Here is a list: -- **AI services**: `anthropic`, `azure`, `deepgram`, `gladia`, `google`, `fal`, `lmnt`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`, `xtts` +- **AI services**: `anthropic`, `aws`, `azure`, `deepgram`, `gladia`, `google`, `fal`, `lmnt`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`, `xtts` - **Transports**: `local`, `websocket`, `daily` ## Code examples @@ -110,7 +110,6 @@ python app.py Daily provides a prebuilt WebRTC user interface. Whilst the app is running, you can visit at `https://.daily.co/` and listen to the bot say hello! - ## WebRTC for production use WebSockets are fine for server-to-server communication or for initial development. But for production use, you’ll need client-server audio to use a protocol designed for real-time media transport. (For an explanation of the difference between WebSockets and WebRTC, see [this post.](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/#webrtc)) @@ -131,7 +130,6 @@ pip install pipecat-ai[silero] The first time your run your bot with Silero, startup may take a while whilst it downloads and caches the model in the background. You can check the progress of this in the console. - ## Hacking on the framework itself _Note that you may need to set up a virtual environment before following the instructions below. For instance, you might need to run the following from the root of the repo:_ diff --git a/dot-env.template b/dot-env.template index 085e8b19d..e940b1076 100644 --- a/dot-env.template +++ b/dot-env.template @@ -1,6 +1,11 @@ # Anthropic ANTHROPIC_API_KEY=... +# AWS +AWS_SECRET_ACCESS_KEY=... +AWS_ACCESS_KEY_ID=... +AWS_REGION=... + # Azure AZURE_SPEECH_REGION=... AZURE_SPEECH_API_KEY=... 
diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py new file mode 100644 index 000000000..891ffd381 --- /dev/null +++ b/examples/foundational/07m-interruptible-aws.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.frames.frames import LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) +from pipecat.services.aws import AWSTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVADAnalyzer + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + audio_out_sample_rate=16000, + transcription_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + ) + + tts = AWSTTSService( + api_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + region=os.getenv("AWS_REGION"), + voice_id="Amy", + params=AWSTTSService.InputParams(engine="neural", language="en-GB", rate="1.05"), + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) + + task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. 
+        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([LLMMessagesFrame(messages)])
+
+    runner = PipelineRunner()
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index 46345ed71..8dcfd7cb0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ Website = "https://pipecat.ai"
 
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.34.0" ]
+aws = [ "boto3~=1.35.27" ]
 azure = [ "azure-cognitiveservices-speech~=1.40.0" ]
 cartesia = [ "cartesia~=1.0.13", "websockets~=12.0" ]
 daily = [ "daily-python~=0.10.1" ]
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
new file mode 100644
index 000000000..dfca10131
--- /dev/null
+++ b/src/pipecat/services/aws.py
@@ -0,0 +1,146 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+from typing import AsyncGenerator, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    Frame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.ai_services import TTSService
+
+try:
+    import boto3
+    from botocore.exceptions import BotoCoreError, ClientError
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use AWS, you need to `pip install pipecat-ai[aws]`. Also, set the `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variables."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+class AWSTTSService(TTSService):
+    class InputParams(BaseModel):
+        engine: Optional[str] = None
+        language: Optional[str] = None
+        pitch: Optional[str] = None
+        rate: Optional[str] = None
+        volume: Optional[str] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aws_access_key_id: str,
+        region: str,
+        voice_id: str = "Joanna",
+        sample_rate: int = 16000,
+        params: InputParams = InputParams(),
+        **kwargs,
+    ):
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        self._polly_client = boto3.client(
+            "polly",
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=api_key,
+            region_name=region,
+        )
+        self._voice_id = voice_id
+        self._sample_rate = sample_rate
+        self._params = params
+
+    def can_generate_metrics(self) -> bool:
+        return True
+
+    def _construct_ssml(self, text: str) -> str:
+        ssml = "<speak>"
+
+        if self._params.language:
+            ssml += f"<lang xml:lang='{self._params.language}'>"
+
+        prosody_attrs = []
+        # Prosody tags are only supported for standard and neural engines
+        if self._params.engine != "generative":
+            if self._params.rate:
+                prosody_attrs.append(f"rate='{self._params.rate}'")
+            if self._params.pitch:
+                prosody_attrs.append(f"pitch='{self._params.pitch}'")
+            if self._params.volume:
+                prosody_attrs.append(f"volume='{self._params.volume}'")
+
+            if prosody_attrs:
+                ssml += f"<prosody {' '.join(prosody_attrs)}>"
+        else:
+            logger.warning("Prosody tags are not supported for generative engine. Ignoring.")
+
+        ssml += text
+
+        if prosody_attrs:
+            ssml += "</prosody>"
+
+        if self._params.language:
+            ssml += "</lang>"
+
+        ssml += "</speak>"
+
+        return ssml
+
+    async def set_voice(self, voice: str):
+        logger.debug(f"Switching TTS voice to: [{voice}]")
+        self._voice_id = voice
+
+    async def set_engine(self, engine: str):
+        logger.debug(f"Switching TTS engine to: [{engine}]")
+        self._params.engine = engine
+
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        logger.debug(f"Generating TTS: [{text}]")
+
+        try:
+            await self.start_ttfb_metrics()
+
+            # Construct the parameters dictionary
+            ssml = self._construct_ssml(text)
+
+            params = {
+                "Text": ssml,
+                "TextType": "ssml",
+                "OutputFormat": "pcm",
+                "VoiceId": self._voice_id,
+                "Engine": self._params.engine,
+                "SampleRate": str(self._sample_rate),
+            }
+
+            # Filter out None values
+            filtered_params = {k: v for k, v in params.items() if v is not None}
+
+            response = self._polly_client.synthesize_speech(**filtered_params)
+
+            await self.start_tts_usage_metrics(text)
+
+            await self.push_frame(TTSStartedFrame())
+
+            if "AudioStream" in response:
+                with response["AudioStream"] as stream:
+                    audio_data = stream.read()
+                    chunk_size = 4096  # You can adjust this value
+                    for i in range(0, len(audio_data), chunk_size):
+                        chunk = audio_data[i : i + chunk_size]
+                        if len(chunk) > 0:
+                            await self.stop_ttfb_metrics()
+                            frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
+                            yield frame
+
+            await self.push_frame(TTSStoppedFrame())
+
+        except (BotoCoreError, ClientError) as error:
+            logger.exception(f"{self} error generating TTS: {error}")
diff --git a/test-requirements.txt b/test-requirements.txt
index 78280b139..94c81331d 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,6 +1,7 @@
 aiohttp~=3.10.3
 anthropic~=0.30.0
 azure-cognitiveservices-speech~=1.40.0
+boto3~=1.35.27
 daily-python~=0.10.1
 deepgram-sdk~=3.5.0
 fal-client~=0.4.1

From 298b1514862c6a374869c051e6b44a517f2c706c Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Thu, 26 Sep 2024 13:05:39 -0400
Subject: [PATCH 28/60] Add setter methods

---
 src/pipecat/services/aws.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index dfca10131..b32ab05f0 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -102,6 +102,26 @@ async def set_engine(self, engine: str):
         logger.debug(f"Switching TTS engine to: [{engine}]")
         self._params.engine = engine
 
+    async def set_language(self, language: str):
+        logger.debug(f"Switching TTS language to: [{language}]")
+        self._params.language = language
+
+    async def set_pitch(self, pitch: str):
+        logger.debug(f"Switching TTS pitch to: [{pitch}]")
+        self._params.pitch = pitch
+
+    async def set_rate(self, rate: str):
+        logger.debug(f"Switching TTS rate to: [{rate}]")
+        self._params.rate = rate
+
+    async def set_volume(self, volume: str):
+        logger.debug(f"Switching TTS volume to: [{volume}]")
+        self._params.volume = volume
+
+    async def set_params(self, params: InputParams):
+        logger.debug(f"Switching TTS params to: [{params}]")
+        self._params = params
+
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
 

From d3a477902b079388067ed41bb969de4454fae07b Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Thu, 26 Sep 2024 13:08:11 -0400
Subject: [PATCH 29/60] Add changelog entry

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f35978dcd..c7a525c82 100644
---
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added AWS Polly TTS support. + +- Added InputParams to Azure TTS service. + - All `FrameProcessors` can now register event handlers. ``` From b8ece84c6ecf4bf74119371f241a4499d675b7e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 10:39:00 -0700 Subject: [PATCH 30/60] services: super should be super() --- src/pipecat/services/ai_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 21ef8dfca..16280b024 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -260,7 +260,7 @@ async def flush_audio(self): pass async def say(self, text: str): - await super.say(text) + await super().say(text) await self.flush_audio() async def start(self, frame: StartFrame): From d323ea9e95960b17f8a541cbd616bd4272de1caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 16:44:49 -0700 Subject: [PATCH 31/60] async_generator: keep pushing frames downstream --- src/pipecat/processors/async_generator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pipecat/processors/async_generator.py b/src/pipecat/processors/async_generator.py index 66b2a3e99..4f9bc85d0 100644 --- a/src/pipecat/processors/async_generator.py +++ b/src/pipecat/processors/async_generator.py @@ -26,6 +26,8 @@ def __init__(self, *, serializer: FrameSerializer, **kwargs): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) + await self.push_frame(frame, direction) + if isinstance(frame, (CancelFrame, EndFrame)): await self._data_queue.put(None) else: From 706c00d89704f6797e464df9de7c5587bdffd719 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 26 Sep 2024 22:13:37 -0400 Subject: [PATCH 32/60] Code review feedback --- CHANGELOG.md | 2 +- examples/foundational/07m-interruptible-aws.py | 6 +++++- src/pipecat/services/aws.py | 8 +++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a525c82..474f06989 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added AWS Polly TTS support. +- Added AWS Polly TTS support and `07m-interruptible-aws.py` as an example. - Added InputParams to Azure TTS service. 
diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index 891ffd381..69d4b84c1 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -22,6 +22,7 @@ LLMUserResponseAggregator, ) from pipecat.services.aws import AWSTTSService +from pipecat.services.deepgram import DeepgramSTTService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer @@ -43,12 +44,14 @@ async def main(): DailyParams( audio_out_enabled=True, audio_out_sample_rate=16000, - transcription_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True, ), ) + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + tts = AWSTTSService( api_key=os.getenv("AWS_SECRET_ACCESS_KEY"), aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), @@ -72,6 +75,7 @@ async def main(): pipeline = Pipeline( [ transport.input(), # Transport user input + stt, # STT tma_in, # User responses llm, # LLM tts, # TTS diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py index b32ab05f0..f3b2766bc 100644 --- a/src/pipecat/services/aws.py +++ b/src/pipecat/services/aws.py @@ -9,6 +9,7 @@ from pydantic import BaseModel from pipecat.frames.frames import ( + ErrorFrame, Frame, TTSAudioRawFrame, TTSStartedFrame, @@ -152,7 +153,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: if "AudioStream" in response: with response["AudioStream"] as stream: audio_data = stream.read() - chunk_size = 4096 # You can adjust this value + chunk_size = 8192 for i in range(0, len(audio_data), chunk_size): chunk = audio_data[i : i + chunk_size] if len(chunk) > 0: @@ -164,3 +165,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: except (BotoCoreError, ClientError) as error: logger.exception(f"{self} error generating TTS: {error}") + error_message = f"AWS Polly TTS error: {str(error)}" + yield ErrorFrame(error=error_message) + + finally: + await self.push_frame(TTSStoppedFrame()) From 2a05cd35b0b5a295126d88a2170c0fd78cad0695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 16:45:20 -0700 Subject: [PATCH 33/60] rtvi: add multiple RTVI frame processors --- src/pipecat/processors/frameworks/rtvi.py | 281 +++++++++++++++++----- 1 file changed, 220 insertions(+), 61 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 930b2331d..03d63c7f0 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -5,6 +5,7 @@ # import asyncio +import base64 from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field, PrivateAttr, ValidationError @@ -20,8 +21,14 @@ ErrorFrame, Frame, InterimTranscriptionFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + OutputAudioRawFrame, StartFrame, SystemFrame, + TTSStartedFrame, + TTSStoppedFrame, + TextFrame, TranscriptionFrame, TransportMessageFrame, UserStartedSpeakingFrame, @@ -242,33 +249,75 @@ class RTVILLMFunctionCallResultData(BaseModel): result: dict | str +class RTVIBotLLMStartedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-llm-started"] = "bot-llm-started" + + +class RTVIBotLLMStoppedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: 
Literal["bot-llm-stopped"] = "bot-llm-stopped" + + +class RTVIBotTTSStartedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-tts-started"] = "bot-tts-started" + + +class RTVIBotTTSStoppedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-tts-stopped"] = "bot-tts-stopped" + + class RTVITextMessageData(BaseModel): text: str -class RTVILLMTextMessage(BaseModel): +class RTVIBotLLMTextMessage(BaseModel): label: Literal["rtvi-ai"] = "rtvi-ai" - type: Literal["llm-text"] = "llm-text" + type: Literal["bot-llm-text"] = "bot-llm-text" data: RTVITextMessageData -class RTVITTSTextMessage(BaseModel): +class RTVIBotTTSTextMessage(BaseModel): label: Literal["rtvi-ai"] = "rtvi-ai" - type: Literal["tts-text"] = "tts-text" + type: Literal["bot-tts-text"] = "bot-tts-text" data: RTVITextMessageData -class RTVITranscriptionMessageData(BaseModel): +class RTVIAudioMessageData(BaseModel): + audio: str + sample_rate: int + num_channels: int + + +class RTVIBotAudioMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-audio"] = "bot-audio" + data: RTVIAudioMessageData + + +class RTVIBotTranscriptionMessageData(BaseModel): + text: str + + +class RTVIBotTranscriptionMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-transcription"] = "bot-transcription" + data: RTVIBotTranscriptionMessageData + + +class RTVIUserTranscriptionMessageData(BaseModel): text: str user_id: str timestamp: str final: bool -class RTVITranscriptionMessage(BaseModel): +class RTVIUserTranscriptionMessage(BaseModel): label: Literal["rtvi-ai"] = "rtvi-ai" type: Literal["user-transcription"] = "user-transcription" - data: RTVITranscriptionMessageData + data: RTVIUserTranscriptionMessageData class RTVIUserStartedSpeakingMessage(BaseModel): @@ -295,6 +344,170 @@ class RTVIProcessorParams(BaseModel): send_bot_ready: bool = True +class RTVIFrameProcessor(FrameProcessor): + def __init__(self, direction: FrameDirection = FrameDirection.DOWNSTREAM, **kwargs): + super().__init__(**kwargs) + self._direction = direction + + async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True): + frame = TransportMessageFrame( + message=model.model_dump(exclude_none=exclude_none), urgent=True + ) + await self.push_frame(frame, self._direction) + + +class RTVISpeakingProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, (UserStartedSpeakingFrame, UserStoppedSpeakingFrame)): + await self._handle_interruptions(frame) + elif isinstance(frame, (BotStartedSpeakingFrame, BotStoppedSpeakingFrame)): + await self._handle_bot_speaking(frame) + + async def _handle_interruptions(self, frame: Frame): + message = None + if isinstance(frame, UserStartedSpeakingFrame): + message = RTVIUserStartedSpeakingMessage() + elif isinstance(frame, UserStoppedSpeakingFrame): + message = RTVIUserStoppedSpeakingMessage() + + if message: + await self._push_transport_message(message) + + async def _handle_bot_speaking(self, frame: Frame): + message = None + if isinstance(frame, BotStartedSpeakingFrame): + message = RTVIBotStartedSpeakingMessage() + elif isinstance(frame, BotStoppedSpeakingFrame): + message = RTVIBotStoppedSpeakingMessage() + + if message: + await self._push_transport_message(message) + + +class 
RTVIUserTranscriptionProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, (TranscriptionFrame, InterimTranscriptionFrame)): + await self._handle_user_transcriptions(frame) + + async def _handle_user_transcriptions(self, frame: Frame): + message = None + if isinstance(frame, TranscriptionFrame): + message = RTVIUserTranscriptionMessage( + data=RTVIUserTranscriptionMessageData( + text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=True + ) + ) + elif isinstance(frame, InterimTranscriptionFrame): + message = RTVIUserTranscriptionMessage( + data=RTVIUserTranscriptionMessageData( + text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=False + ) + ) + + if message: + await self._push_transport_message(message) + + +class RTVIBotLLMProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, LLMFullResponseStartFrame): + await self._push_transport_message(RTVIBotLLMStartedMessage()) + elif isinstance(frame, LLMFullResponseEndFrame): + await self._push_transport_message(RTVIBotLLMStoppedMessage()) + + +class RTVIBotTTSProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, TTSStartedFrame): + await self._push_transport_message(RTVIBotTTSStartedMessage()) + elif isinstance(frame, TTSStoppedFrame): + await self._push_transport_message(RTVIBotTTSStoppedMessage()) + + +class RTVIBotLLMTextProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, TextFrame): + await self._handle_text(frame) + + async def _handle_text(self, frame: TextFrame): + message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text)) + await self._push_transport_message(message) + + +class RTVIBotTTSTextProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, TextFrame): + await self._handle_text(frame) + + async def _handle_text(self, frame: TextFrame): + message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text)) + await self._push_transport_message(message) + + +class RTVIBotAudioProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, OutputAudioRawFrame): + await self._handle_audio(frame) + + async def _handle_audio(self, frame: OutputAudioRawFrame): + encoded = base64.b64encode(frame.audio).decode("utf-8") + message = 
RTVIBotAudioMessage( + data=RTVIAudioMessageData( + audio=encoded, sample_rate=frame.sample_rate, num_channels=frame.num_channels + ) + ) + await self._push_transport_message(message) + + class RTVIProcessor(FrameProcessor): def __init__( self, @@ -394,20 +607,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # finish and the task finishes when EndFrame is processed. await self.push_frame(frame, direction) await self._stop(frame) - elif isinstance(frame, UserStartedSpeakingFrame) or isinstance( - frame, UserStoppedSpeakingFrame - ): - await self._handle_interruptions(frame) - await self.push_frame(frame, direction) - elif isinstance(frame, BotStartedSpeakingFrame) or isinstance( - frame, BotStoppedSpeakingFrame - ): - await self._handle_bot_speaking(frame) - await self.push_frame(frame, direction) # Data frames - elif isinstance(frame, TranscriptionFrame) or isinstance(frame, InterimTranscriptionFrame): - await self._handle_transcriptions(frame) - await self.push_frame(frame, direction) elif isinstance(frame, TransportMessageFrame): await self._message_queue.put(frame) elif isinstance(frame, RTVIActionFrame): @@ -452,47 +652,6 @@ async def _push_transport_message(self, model: BaseModel, exclude_none: bool = T ) await self.push_frame(frame) - async def _handle_transcriptions(self, frame: Frame): - # TODO(aleix): Once we add support for using custom pipelines, the STTs will - # be in the pipeline after this processor. - - message = None - if isinstance(frame, TranscriptionFrame): - message = RTVITranscriptionMessage( - data=RTVITranscriptionMessageData( - text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=True - ) - ) - elif isinstance(frame, InterimTranscriptionFrame): - message = RTVITranscriptionMessage( - data=RTVITranscriptionMessageData( - text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=False - ) - ) - - if message: - await self._push_transport_message(message) - - async def _handle_interruptions(self, frame: Frame): - message = None - if isinstance(frame, UserStartedSpeakingFrame): - message = RTVIUserStartedSpeakingMessage() - elif isinstance(frame, UserStoppedSpeakingFrame): - message = RTVIUserStoppedSpeakingMessage() - - if message: - await self._push_transport_message(message) - - async def _handle_bot_speaking(self, frame: Frame): - message = None - if isinstance(frame, BotStartedSpeakingFrame): - message = RTVIBotStartedSpeakingMessage() - elif isinstance(frame, BotStoppedSpeakingFrame): - message = RTVIBotStoppedSpeakingMessage() - - if message: - await self._push_transport_message(message) - async def _action_task_handler(self): while True: try: From 6e8a202107ad31f4d0987792128fab662321c5b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 22:42:19 -0700 Subject: [PATCH 34/60] rtvi: fix handling transport messages --- src/pipecat/processors/frameworks/rtvi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 03d63c7f0..9721a6613 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -609,7 +609,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self._stop(frame) # Data frames elif isinstance(frame, TransportMessageFrame): - await self._message_queue.put(frame) + await self._handle_transport_message(frame) elif isinstance(frame, RTVIActionFrame): await 
self._action_queue.put(frame) # Other frames From 2c8e5665076ad9f187982a196e3cab2f1bd11b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 22:42:36 -0700 Subject: [PATCH 35/60] rtvi: update version to 0.2 --- src/pipecat/processors/frameworks/rtvi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 9721a6613..f88660f60 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -41,7 +41,7 @@ from loguru import logger -RTVI_PROTOCOL_VERSION = "0.1" +RTVI_PROTOCOL_VERSION = "0.2" ActionResult = Union[bool, int, float, str, list, dict] From 830d2df671a93a8b2293ee9e4a7767fd6abc2874 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 26 Sep 2024 13:03:46 -0400 Subject: [PATCH 36/60] Add Google TTS --- CHANGELOG.md | 2 + .../foundational/07n-interruptible-google.py | 100 ++++++++ pyproject.toml | 2 +- src/pipecat/services/google.py | 217 +++++++++++++++++- test-requirements.txt | 1 + 5 files changed, 312 insertions(+), 10 deletions(-) create mode 100644 examples/foundational/07n-interruptible-google.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 474f06989..37189eb47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added Google TTS service and corresponding foundational example `07n-interruptible-google.py` + - Added AWS Polly TTS support and `07m-interruptible-aws.py` as an example. - Added InputParams to Azure TTS service. diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py new file mode 100644 index 000000000..713b3dce3 --- /dev/null +++ b/examples/foundational/07n-interruptible-google.py @@ -0,0 +1,100 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.frames.frames import LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) +from pipecat.services.deepgram import DeepgramSTTService +from pipecat.services.google import GoogleTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVADAnalyzer + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + audio_out_sample_rate=24000, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True, + ), + ) + + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + + tts = GoogleTTSService( + credentials=os.getenv("GOOGLE_CREDENTIALS"), + voice_id="en-US-Neural2-J", + params=GoogleTTSService.InputParams(language="en-US", rate="1.05"), + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + 
messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) + + task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMMessagesFrame(messages)]) + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 8dcfd7cb0..876242343 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ elevenlabs = [ "websockets~=12.0" ] examples = [ "python-dotenv~=1.0.1", "flask~=3.0.3", "flask_cors~=4.0.1" ] fal = [ "fal-client~=0.4.1" ] gladia = [ "websockets~=12.0" ] -google = [ "google-generativeai~=0.7.2" ] +google = [ "google-generativeai~=0.7.2", "google-cloud-texttospeech~=2.17.2" ] gstreamer = [ "pygobject~=3.48.2" ] fireworks = [ "openai~=1.37.2" ] langchain = [ "langchain~=0.2.14", "langchain-community~=0.2.12", "langchain-openai~=0.1.20" ] diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 4de6b77fa..2a0a7d1e1 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -5,30 +5,37 @@ # import asyncio +import json +from typing import AsyncGenerator, List, Literal, Optional -from typing import List +import numpy as np +from loguru import logger +from pydantic import BaseModel from pipecat.frames.frames import ( Frame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMMessagesFrame, LLMModelUpdateFrame, TextFrame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, VisionImageRawFrame, - LLMMessagesFrame, - LLMFullResponseStartFrame, - LLMFullResponseEndFrame, ) -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, ) - -from loguru import logger +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService, TTSService try: - import google.generativeai as gai import google.ai.generativelanguage as glm + import google.generativeai as gai + from google.cloud import texttospeech_v1 + from google.oauth2 import service_account except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -137,3 +144,195 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if context: await self._process_context(context) + + +class GoogleTTSService(TTSService): + class InputParams(BaseModel): + pitch: Optional[str] = None + rate: Optional[str] = None + volume: Optional[str] = None + emphasis: Optional[Literal["strong", "moderate", "reduced", 
"none"]] = None + language: Optional[str] = None + gender: Optional[Literal["male", "female", "neutral"]] = None + google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None + + def __init__( + self, + *, + credentials: Optional[str] = None, + credentials_path: Optional[str] = None, + voice_id: str = "en-US-Neural2-A", + sample_rate: int = 24000, + params: InputParams = InputParams(), + **kwargs, + ): + super().__init__(sample_rate=sample_rate, **kwargs) + + self._voice_id: str = voice_id + self._params = params + self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client( + credentials, credentials_path + ) + + def _create_client( + self, credentials: Optional[str], credentials_path: Optional[str] + ) -> texttospeech_v1.TextToSpeechAsyncClient: + creds: Optional[service_account.Credentials] = None + + # Create a Google Cloud service account for the Cloud Text-to-Speech API + # Using either the provided credentials JSON string or the path to a service account JSON + # file, create a Google Cloud service account and use it to authenticate with the API. + if credentials: + # Use provided credentials JSON string + json_account_info = json.loads(credentials) + creds = service_account.Credentials.from_service_account_info(json_account_info) + elif credentials_path: + # Use service account JSON file if provided + creds = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise ValueError("Either 'credentials' or 'credentials_path' must be provided.") + + return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds) + + def can_generate_metrics(self) -> bool: + return True + + def _construct_ssml(self, text: str) -> str: + ssml = "" + + # Voice tag + voice_attrs = [f"name='{self._voice_id}'"] + if self._params.language: + voice_attrs.append(f"language='{self._params.language}'") + if self._params.gender: + voice_attrs.append(f"gender='{self._params.gender}'") + ssml += f"" + + # Prosody tag + prosody_attrs = [] + if self._params.pitch: + prosody_attrs.append(f"pitch='{self._params.pitch}'") + if self._params.rate: + prosody_attrs.append(f"rate='{self._params.rate}'") + if self._params.volume: + prosody_attrs.append(f"volume='{self._params.volume}'") + + if prosody_attrs: + ssml += f"" + + # Emphasis tag + if self._params.emphasis: + ssml += f"" + + # Google style tag + if self._params.google_style: + ssml += f"" + + ssml += text + + # Close tags + if self._params.google_style: + ssml += "" + if self._params.emphasis: + ssml += "" + if prosody_attrs: + ssml += "" + ssml += "" + + return ssml + + async def set_voice(self, voice: str) -> None: + logger.debug(f"Switching TTS voice to: [{voice}]") + self._voice_id = voice + + async def set_language(self, language: str) -> None: + logger.debug(f"Switching TTS language to: [{language}]") + self._params.language = language + + async def set_pitch(self, pitch: str) -> None: + logger.debug(f"Switching TTS pitch to: [{pitch}]") + self._params.pitch = pitch + + async def set_rate(self, rate: str) -> None: + logger.debug(f"Switching TTS rate to: [{rate}]") + self._params.rate = rate + + async def set_volume(self, volume: str) -> None: + logger.debug(f"Switching TTS volume to: [{volume}]") + self._params.volume = volume + + async def set_emphasis( + self, emphasis: Literal["strong", "moderate", "reduced", "none"] + ) -> None: + logger.debug(f"Switching TTS emphasis to: [{emphasis}]") + self._params.emphasis = emphasis + + async def set_gender(self, gender: Literal["male", 
"female", "neutral"]) -> None: + logger.debug(f"Switch TTS gender to [{gender}]") + self._params.gender = gender + + async def google_style( + self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"] + ) -> None: + logger.debug(f"Switching TTS google style to: [{google_style}]") + self._params.google_style = google_style + + async def set_params(self, params: InputParams) -> None: + logger.debug(f"Switching TTS params to: [{params}]") + self._params = params + + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + logger.debug(f"Generating TTS: [{text}]") + + try: + await self.start_ttfb_metrics() + + ssml = self._construct_ssml(text) + synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml) + voice = texttospeech_v1.VoiceSelectionParams( + language_code=self._params.language, name=self._voice_id + ) + audio_config = texttospeech_v1.AudioConfig( + audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16, + sample_rate_hertz=self.sample_rate, + ) + + request = texttospeech_v1.SynthesizeSpeechRequest( + input=synthesis_input, voice=voice, audio_config=audio_config + ) + + response = await self._client.synthesize_speech(request=request) + + await self.start_tts_usage_metrics(text) + + await self.push_frame(TTSStartedFrame()) + + # The audio produced by the TTS service has an audible click or pop at the beginning. + # This is due to the abrupt start of the audio waveform. To mitigate this, we apply a + # short fade-in effect to the audio data. + + # Convert the response to a mutable numpy array + audio_content = np.frombuffer(response.audio_content, dtype=np.int16).copy() + + # Apply a smooth, short fade-in + fade_duration = int(0.01 * self.sample_rate) # 10ms fade-in + fade_in = np.square( + np.linspace(0, 1, fade_duration) + ) # Quadratic fade for smoother start + audio_content[:fade_duration] = audio_content[:fade_duration] * fade_in + + # Read and yield audio data in chunks + chunk_size = 8192 + for i in range(0, len(audio_content), chunk_size): + chunk = audio_content[i : i + chunk_size].tobytes() + if not chunk: + break + await self.stop_ttfb_metrics() + frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) + yield frame + await asyncio.sleep(0) # Allow other tasks to run + + await self.push_frame(TTSStoppedFrame()) + + except Exception as e: + logger.exception(f"{self} error generating TTS: {e}") diff --git a/test-requirements.txt b/test-requirements.txt index 94c81331d..8c7db7377 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -7,6 +7,7 @@ deepgram-sdk~=3.5.0 fal-client~=0.4.1 fastapi~=0.112.1 faster-whisper~=1.0.3 +google-cloud-texttospeech~=2.17.2 google-generativeai~=0.7.2 langchain~=0.2.14 livekit~=0.13.1 From e7548f9494cc8cd0ab27eba8dae53bb3c9f3af36 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 08:02:44 -0400 Subject: [PATCH 37/60] Code review feedback --- src/pipecat/services/google.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 2a0a7d1e1..38af3e41f 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -8,11 +8,11 @@ import json from typing import AsyncGenerator, List, Literal, Optional -import numpy as np from loguru import logger from pydantic import BaseModel from pipecat.frames.frames import ( + ErrorFrame, Frame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, @@ -307,24 +307,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: 
await self.push_frame(TTSStartedFrame()) - # The audio produced by the TTS service has an audible click or pop at the beginning. - # This is due to the abrupt start of the audio waveform. To mitigate this, we apply a - # short fade-in effect to the audio data. - - # Convert the response to a mutable numpy array - audio_content = np.frombuffer(response.audio_content, dtype=np.int16).copy() - - # Apply a smooth, short fade-in - fade_duration = int(0.01 * self.sample_rate) # 10ms fade-in - fade_in = np.square( - np.linspace(0, 1, fade_duration) - ) # Quadratic fade for smoother start - audio_content[:fade_duration] = audio_content[:fade_duration] * fade_in + # Skip the first 44 bytes to remove the WAV header + audio_content = response.audio_content[44:] # Read and yield audio data in chunks chunk_size = 8192 for i in range(0, len(audio_content), chunk_size): - chunk = audio_content[i : i + chunk_size].tobytes() + chunk = audio_content[i : i + chunk_size] if not chunk: break await self.stop_ttfb_metrics() @@ -336,3 +325,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: except Exception as e: logger.exception(f"{self} error generating TTS: {e}") + error_message = f"TTS generation error: {str(e)}" + yield ErrorFrame(error=error_message) + finally: + await self.push_frame(TTSStoppedFrame()) From 50b6580fbb44b6f686fc249ae2fd160b8f60947b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 27 Sep 2024 13:28:33 -0700 Subject: [PATCH 38/60] livekit: add license notice --- src/pipecat/transports/services/livekit.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index 52bbbf89d..6e5e48d0b 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -1,10 +1,17 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio + from dataclasses import dataclass from typing import Any, Awaitable, Callable, List -import numpy as np -from loguru import logger from pydantic import BaseModel + +import numpy as np from scipy import signal from pipecat.frames.frames import ( @@ -28,6 +35,8 @@ from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.vad.vad_analyzer import VADAnalyzer +from loguru import logger + try: from livekit import rtc from tenacity import retry, stop_after_attempt, wait_exponential From d9b16d4f738f405287bfc79fd350852c48fb829a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 27 Sep 2024 13:32:27 -0700 Subject: [PATCH 39/60] services: import cosmetics --- src/pipecat/services/aws.py | 4 +++- src/pipecat/services/azure.py | 9 ++++++--- src/pipecat/services/deepgram.py | 5 +++-- src/pipecat/services/fal.py | 3 ++- src/pipecat/services/openai.py | 9 +++++---- src/pipecat/services/xtts.py | 4 ++-- 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py index f3b2766bc..80240985f 100644 --- a/src/pipecat/services/aws.py +++ b/src/pipecat/services/aws.py @@ -3,9 +3,9 @@ # # SPDX-License-Identifier: BSD 2-Clause License # + from typing import AsyncGenerator, Optional -from loguru import logger from pydantic import BaseModel from pipecat.frames.frames import ( @@ -17,6 +17,8 @@ ) from pipecat.services.ai_services import TTSService +from loguru import logger + try: import boto3 from botocore.exceptions 
import BotoCoreError, ClientError diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 41fc7598b..c8fa095ab 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import aiohttp import asyncio import io + from typing import AsyncGenerator, Optional -import aiohttp -from loguru import logger -from PIL import Image from pydantic import BaseModel from pipecat.frames.frames import ( @@ -29,6 +28,10 @@ from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 +from PIL import Image + +from loguru import logger + # See .env.example for Azure configuration needed try: from azure.cognitiveservices.speech import ( diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 6929e66e5..d109cce3c 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -5,9 +5,8 @@ # import asyncio -from typing import AsyncGenerator -from loguru import logger +from typing import AsyncGenerator from pipecat.frames.frames import ( CancelFrame, @@ -25,6 +24,8 @@ from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 +from loguru import logger + # See .env.example for Deepgram configuration needed try: from deepgram import ( diff --git a/src/pipecat/services/fal.py b/src/pipecat/services/fal.py index bb7b47dfc..aecdeb709 100644 --- a/src/pipecat/services/fal.py +++ b/src/pipecat/services/fal.py @@ -8,13 +8,14 @@ import io import os -from PIL import Image from pydantic import BaseModel from typing import AsyncGenerator, Optional, Union, Dict from pipecat.frames.frames import ErrorFrame, Frame, URLImageRawFrame from pipecat.services.ai_services import ImageGenService +from PIL import Image + from loguru import logger try: diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index e54898525..47bee5ec1 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -9,14 +9,12 @@ import io import json import httpx + from dataclasses import dataclass from typing import Any, AsyncGenerator, Dict, List, Literal, Optional from pydantic import BaseModel, Field -from loguru import logger -from PIL import Image - from pipecat.frames.frames import ( ErrorFrame, Frame, @@ -39,7 +37,6 @@ LLMUserContextAggregator, LLMAssistantContextAggregator, ) - from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, @@ -47,6 +44,10 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import ImageGenService, LLMService, TTSService +from PIL import Image + +from loguru import logger + try: from openai import AsyncOpenAI, AsyncStream, DefaultAsyncHttpxClient, BadRequestError, NOT_GIVEN from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 5161efcf6..2c47d59e8 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -18,10 +18,10 @@ ) from pipecat.services.ai_services import TTSService -from loguru import logger - import numpy as np +from loguru import logger + try: import resampy except ModuleNotFoundError as e: From 44a349386c135df45beb07c32b82f6e17747c2c7 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 10:43:49 -0400 Subject: [PATCH 40/60] Consolidate update frames classes into a single 
UpdateSettingsFrame class --- CHANGELOG.md | 3 + src/pipecat/frames/frames.py | 121 +++++----------------------- src/pipecat/services/ai_services.py | 38 +++++---- src/pipecat/services/anthropic.py | 63 +++++++++------ src/pipecat/services/google.py | 9 ++- src/pipecat/services/openai.py | 54 ++++++++----- src/pipecat/services/together.py | 62 ++++++++------ 7 files changed, 157 insertions(+), 193 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37189eb47..0f489556c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,9 @@ async def on_connected(processor): ### Changed +- Updated individual update settings frame classes into a single UpdateSettingsFrame + class for STT, LLM, and TTS. + - We now distinguish between input and output audio and image frames. We introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` and `OutputImageRawFrame` (and other subclasses of those). The input frames diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 273aad214..1b31b9c88 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -4,9 +4,8 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from typing import Any, List, Optional, Tuple - from dataclasses import dataclass, field +from typing import Any, List, Optional, Tuple from pipecat.clocks.base_clock import BaseClock from pipecat.metrics.metrics import MetricsData @@ -528,113 +527,35 @@ def __str__(self): @dataclass -class LLMModelUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM model.""" - - model: str - - -@dataclass -class LLMTemperatureUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM temperature.""" - - temperature: float - - -@dataclass -class LLMTopKUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM top_k.""" - - top_k: int - - -@dataclass -class LLMTopPUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM top_p.""" - - top_p: float - - -@dataclass -class LLMFrequencyPenaltyUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM frequency - penalty. - - """ - - frequency_penalty: float - - -@dataclass -class LLMPresencePenaltyUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM presence - penalty. 
+class LLMUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update LLM settings.""" - """ - - presence_penalty: float + model: Optional[str] = None + temperature: Optional[float] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + max_tokens: Optional[int] = None + seed: Optional[int] = None + extra: dict = field(default_factory=dict) @dataclass -class LLMMaxTokensUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM max tokens.""" - - max_tokens: int - - -@dataclass -class LLMSeedUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM seed.""" - - seed: int - - -@dataclass -class LLMExtraUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM extra params.""" - - extra: dict - - -@dataclass -class TTSModelUpdateFrame(ControlFrame): - """A control frame containing a request to update the TTS model.""" - - model: str - - -@dataclass -class TTSVoiceUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new TTS voice.""" - - voice: str - - -@dataclass -class TTSLanguageUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new TTS language and - optional voice. - - """ - - language: Language - - -@dataclass -class STTModelUpdateFrame(ControlFrame): - """A control frame containing a request to update the STT model and optional - language. - - """ +class TTSUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update TTS settings.""" - model: str + model: Optional[str] = None + voice: Optional[str] = None + language: Optional[Language] = None @dataclass -class STTLanguageUpdateFrame(ControlFrame): - """A control frame containing a request to update to STT language.""" +class STTUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update STT settings.""" - language: Language + model: Optional[str] = None + language: Optional[Language] = None @dataclass diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 16280b024..79e52531d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -7,10 +7,11 @@ import asyncio import io import wave - from abc import abstractmethod from typing import AsyncGenerator, List, Optional, Tuple +from loguru import logger + from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -18,31 +19,26 @@ ErrorFrame, Frame, LLMFullResponseEndFrame, - STTLanguageUpdateFrame, - STTModelUpdateFrame, StartFrame, StartInterruptionFrame, + STTUpdateSettingsFrame, + TextFrame, TTSAudioRawFrame, - TTSLanguageUpdateFrame, - TTSModelUpdateFrame, TTSSpeakFrame, TTSStartedFrame, TTSStoppedFrame, - TTSVoiceUpdateFrame, - TextFrame, + TTSUpdateSettingsFrame, UserImageRequestFrame, VisionImageRawFrame, ) from pipecat.metrics.metrics import MetricsData +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transcriptions.language import Language from pipecat.utils.audio import calculate_audio_volume from pipecat.utils.string import match_endofsentence from pipecat.utils.time import seconds_to_nanoseconds from pipecat.utils.utils import exp_smoothing -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext - -from loguru 
import logger class AIService(FrameProcessor): @@ -230,12 +226,13 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) elif isinstance(frame, TTSSpeakFrame): await self._push_tts_frames(frame.text) - elif isinstance(frame, TTSModelUpdateFrame): - await self.set_model(frame.model) - elif isinstance(frame, TTSVoiceUpdateFrame): - await self.set_voice(frame.voice) - elif isinstance(frame, TTSLanguageUpdateFrame): - await self.set_language(frame.language) + elif isinstance(frame, TTSUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.voice is not None: + await self.set_voice(frame.voice) + if frame.language is not None: + await self.set_language(frame.language) else: await self.push_frame(frame, direction) @@ -408,10 +405,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # In this service we accumulate audio internally and at the end we # push a TextFrame. We don't really want to push audio frames down. await self.process_audio_frame(frame) - elif isinstance(frame, STTModelUpdateFrame): - await self.set_model(frame.model) - elif isinstance(frame, STTLanguageUpdateFrame): - await self.set_language(frame.language) + elif isinstance(frame, STTUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.language is not None: + await self.set_language(frame.language) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 8b8e187ea..1c4cd284e 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -5,47 +5,47 @@ # import base64 -import json -import io import copy -from typing import Any, Dict, List, Optional +import io +import json +import re +from asyncio import CancelledError from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from loguru import logger from PIL import Image -from asyncio import CancelledError -import re from pydantic import BaseModel, Field from pipecat.frames.frames import ( Frame, + FunctionCallInProgressFrame, + FunctionCallResultFrame, LLMEnablePromptCachingFrame, - LLMModelUpdateFrame, - TextFrame, - VisionImageRawFrame, - UserImageRequestFrame, - UserImageRawFrame, - LLMMessagesFrame, - LLMFullResponseStartFrame, LLMFullResponseEndFrame, - FunctionCallResultFrame, - FunctionCallInProgressFrame, + LLMFullResponseStartFrame, + LLMMessagesFrame, + LLMUpdateSettingsFrame, StartInterruptionFrame, + TextFrame, + UserImageRawFrame, + UserImageRequestFrame, + VisionImageRawFrame, ) from pipecat.metrics.metrics import LLMTokenUsage -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantContextAggregator, + LLMUserContextAggregator, +) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, ) -from pipecat.processors.aggregators.llm_response import ( - LLMUserContextAggregator, - LLMAssistantContextAggregator, -) - -from loguru import logger +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService try: - from anthropic import AsyncAnthropic, NOT_GIVEN, NotGiven + from anthropic import NOT_GIVEN, AsyncAnthropic, NotGiven except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -293,9 +293,20 @@ async def 
process_frame(self, frame: Frame, direction: FrameDirection): # UserImageRawFrames coming through the pipeline and add them # to the context. context = AnthropicLLMContext.from_image_frame(frame) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) elif isinstance(frame, LLMEnablePromptCachingFrame): logger.debug(f"Setting enable prompt caching to: [{frame.enable}]") self._enable_prompt_caching_beta = frame.enable diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 38af3e41f..53efd8c17 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -17,7 +17,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMModelUpdateFrame, + LLMUpdateSettingsFrame, TextFrame, TTSAudioRawFrame, TTSStartedFrame, @@ -136,9 +136,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self._create_client(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 47bee5ec1..a830b65a8 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -4,38 +4,39 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import base64 import io import json -import httpx - from dataclasses import dataclass - from typing import Any, AsyncGenerator, Dict, List, Literal, Optional + +import aiohttp +import httpx +from loguru import logger +from PIL import Image from pydantic import BaseModel, Field from pipecat.frames.frames import ( ErrorFrame, Frame, + FunctionCallInProgressFrame, + FunctionCallResultFrame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMModelUpdateFrame, + LLMUpdateSettingsFrame, + StartInterruptionFrame, + TextFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - TextFrame, URLImageRawFrame, VisionImageRawFrame, - FunctionCallResultFrame, - FunctionCallInProgressFrame, - StartInterruptionFrame, ) from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.aggregators.llm_response import ( - LLMUserContextAggregator, LLMAssistantContextAggregator, + LLMUserContextAggregator, ) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, @@ -44,12 +45,14 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import ImageGenService, LLMService, TTSService -from PIL import Image - -from loguru import logger - try: - from openai import AsyncOpenAI, 
AsyncStream, DefaultAsyncHttpxClient, BadRequestError, NOT_GIVEN + from openai import ( + NOT_GIVEN, + AsyncOpenAI, + AsyncStream, + BadRequestError, + DefaultAsyncHttpxClient, + ) from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -280,9 +283,22 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.seed is not None: + await self.set_seed(frame.seed) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) else: await self.push_frame(frame, direction) @@ -464,7 +480,7 @@ async def process_frame(self, frame, direction): await self._push_aggregation() else: logger.warning( - f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + "FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" ) self._function_call_in_progress = None self._function_call_result = None diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index b1365bc69..981aa6de2 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -7,37 +7,36 @@ import json import re import uuid -from pydantic import BaseModel, Field - -from typing import Any, Dict, List, Optional -from dataclasses import dataclass from asyncio import CancelledError +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from loguru import logger +from pydantic import BaseModel, Field from pipecat.frames.frames import ( Frame, - LLMModelUpdateFrame, - TextFrame, - UserImageRequestFrame, - LLMMessagesFrame, - LLMFullResponseStartFrame, - LLMFullResponseEndFrame, - FunctionCallResultFrame, FunctionCallInProgressFrame, + FunctionCallResultFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMMessagesFrame, + LLMUpdateSettingsFrame, StartInterruptionFrame, + TextFrame, + UserImageRequestFrame, ) from pipecat.metrics.metrics import LLMTokenUsage -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantContextAggregator, + LLMUserContextAggregator, +) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, ) -from pipecat.processors.aggregators.llm_response import ( - LLMUserContextAggregator, - LLMAssistantContextAggregator, -) - -from loguru import logger +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService try: from together import AsyncTogether @@ -188,7 +187,7 @@ async def _process_context(self, context: 
OpenAILLMContext): if chunk.choices[0].finish_reason == "eos" and accumulating_function_call: await self._extract_function_call(context, function_call_accumulator) - except CancelledError as e: + except CancelledError: # todo: implement token counting estimates for use when the user interrupts a long generation # we do this in the anthropic.py service raise @@ -206,9 +205,24 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = frame.context elif isinstance(frame, LLMMessagesFrame): context = TogetherLLMContext.from_messages(frame.messages) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) else: await self.push_frame(frame, direction) @@ -338,7 +352,7 @@ async def process_frame(self, frame, direction): await self._push_aggregation() else: logger.warning( - f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + "FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" ) self._function_call_in_progress = None self._function_call_result = None From 7fe118ce639aeac6bd84c7c2e721f3cf6dd298e0 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 11:22:03 -0400 Subject: [PATCH 41/60] Align use of language param across TTS services --- src/pipecat/services/azure.py | 17 ++++++++++------- src/pipecat/services/elevenlabs.py | 10 +++++----- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index c8fa095ab..a1349cefe 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -41,7 +41,10 @@ SpeechRecognizer, SpeechSynthesizer, ) - from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream + from azure.cognitiveservices.speech.audio import ( + AudioStreamFormat, + PushAudioInputStream, + ) from azure.cognitiveservices.speech.dialog import AudioConfig from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: @@ -73,7 +76,7 @@ def create_client(self, api_key=None, base_url=None, **kwargs): class AzureTTSService(TTSService): class InputParams(BaseModel): emphasis: Optional[str] = None - language_code: Optional[str] = "en-US" + language: Optional[str] = "en-US" pitch: Optional[str] = None rate: Optional[str] = "1.05" role: Optional[str] = None @@ -105,7 +108,7 @@ def can_generate_metrics(self) -> bool: def _construct_ssml(self, text: str) -> str: ssml = ( - f"" f"" @@ -155,9 +158,9 @@ async def set_emphasis(self, emphasis: str): logger.debug(f"Setting TTS emphasis to: [{emphasis}]") self._params.emphasis = emphasis - async def set_language_code(self, language_code: str): - logger.debug(f"Setting TTS language code to: [{language_code}]") - 
self._params.language_code = language_code + async def set_language(self, language: str): + logger.debug(f"Setting TTS language code to: [{language}]") + self._params.language = language async def set_pitch(self, pitch: str): logger.debug(f"Setting TTS pitch to: [{pitch}]") @@ -187,7 +190,7 @@ async def set_params(self, **kwargs): valid_params = { "voice": self.set_voice, "emphasis": self.set_emphasis, - "language_code": self.set_language_code, + "language_code": self.set_language, "pitch": self.set_pitch, "rate": self.set_rate, "role": self.set_role, diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 79d90bc58..ca4713f5f 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -72,7 +72,7 @@ def calculate_word_times( class ElevenLabsTTSService(AsyncWordTTSService): class InputParams(BaseModel): - language_code: Optional[str] = None + language: Optional[str] = None output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" optimize_streaming_latency: Optional[str] = None stability: Optional[float] = None @@ -229,13 +229,13 @@ async def _connect(self): if self._params.optimize_streaming_latency: url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}" - # language_code can only be used with the 'eleven_turbo_v2_5' model - if self._params.language_code: + # language can only be used with the 'eleven_turbo_v2_5' model + if self._params.language: if model == "eleven_turbo_v2_5": - url += f"&language_code={self._params.language_code}" + url += f"&language_code={self._params.language}" else: logger.debug( - f"Language code [{self._params.language_code}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model." + f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model." 
) self._websocket = await websockets.connect(url) From d7555609fd752e27ea34879356613d50d4055e8b Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 11:57:50 -0400 Subject: [PATCH 42/60] Add TTS update settings options --- src/pipecat/frames/frames.py | 12 +++++- src/pipecat/services/ai_services.py | 62 ++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 1b31b9c88..8059b904b 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -5,7 +5,7 @@ # from dataclasses import dataclass, field -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union from pipecat.clocks.base_clock import BaseClock from pipecat.metrics.metrics import MetricsData @@ -548,6 +548,16 @@ class TTSUpdateSettingsFrame(ControlFrame): model: Optional[str] = None voice: Optional[str] = None language: Optional[Language] = None + speed: Optional[Union[str, float]] = None + emotion: Optional[List[str]] = None + engine: Optional[str] = None + pitch: Optional[str] = None + rate: Optional[str] = None + volume: Optional[str] = None + emphasis: Optional[str] = None + style: Optional[str] = None + style_degree: Optional[str] = None + role: Optional[str] = None @dataclass diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 79e52531d..1cb91d6a2 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -8,7 +8,7 @@ import io import wave from abc import abstractmethod -from typing import AsyncGenerator, List, Optional, Tuple +from typing import AsyncGenerator, List, Optional, Tuple, Union from loguru import logger @@ -170,6 +170,46 @@ async def set_voice(self, voice: str): async def set_language(self, language: Language): pass + @abstractmethod + async def set_speed(self, speed: Union[str, float]): + pass + + @abstractmethod + async def set_emotion(self, emotion: List[str]): + pass + + @abstractmethod + async def set_engine(self, engine: str): + pass + + @abstractmethod + async def set_pitch(self, pitch: str): + pass + + @abstractmethod + async def set_rate(self, rate: str): + pass + + @abstractmethod + async def set_volume(self, volume: str): + pass + + @abstractmethod + async def set_emphasis(self, emphasis: str): + pass + + @abstractmethod + async def set_style(self, style: str): + pass + + @abstractmethod + async def set_style_degree(self, style_degree: str): + pass + + @abstractmethod + async def set_role(self, role: str): + pass + # Converts the text to audio. 
@abstractmethod async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: @@ -233,6 +273,26 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.set_voice(frame.voice) if frame.language is not None: await self.set_language(frame.language) + if frame.speed is not None: + await self.set_speed(frame.speed) + if frame.emotion is not None: + await self.set_emotion(frame.emotion) + if frame.engine is not None: + await self.set_engine(frame.engine) + if frame.pitch is not None: + await self.set_pitch(frame.pitch) + if frame.rate is not None: + await self.set_rate(frame.rate) + if frame.volume is not None: + await self.set_volume(frame.volume) + if frame.emphasis is not None: + await self.set_emphasis(frame.emphasis) + if frame.style is not None: + await self.set_style(frame.style) + if frame.style_degree is not None: + await self.set_style_degree(frame.style_degree) + if frame.role is not None: + await self.set_role(frame.role) else: await self.push_frame(frame, direction) From 1f77863aef18c1f81d181d05e03924cbdf855e2a Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 12:45:32 -0400 Subject: [PATCH 43/60] Code review feedback --- src/pipecat/services/ai_services.py | 66 ++++++++++++++++------------- src/pipecat/services/anthropic.py | 29 +++++++------ src/pipecat/services/openai.py | 33 ++++++++------- src/pipecat/services/together.py | 37 ++++++++-------- 4 files changed, 90 insertions(+), 75 deletions(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 1cb91d6a2..ba78b24f8 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -248,6 +248,34 @@ async def _push_tts_frames(self, text: str): # interrupted, the text is not added to the assistant context. 
await self.push_frame(TextFrame(text)) + async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.voice is not None: + await self.set_voice(frame.voice) + if frame.language is not None: + await self.set_language(frame.language) + if frame.speed is not None: + await self.set_speed(frame.speed) + if frame.emotion is not None: + await self.set_emotion(frame.emotion) + if frame.engine is not None: + await self.set_engine(frame.engine) + if frame.pitch is not None: + await self.set_pitch(frame.pitch) + if frame.rate is not None: + await self.set_rate(frame.rate) + if frame.volume is not None: + await self.set_volume(frame.volume) + if frame.emphasis is not None: + await self.set_emphasis(frame.emphasis) + if frame.style is not None: + await self.set_style(frame.style) + if frame.style_degree is not None: + await self.set_style_degree(frame.style_degree) + if frame.role is not None: + await self.set_role(frame.role) + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -267,32 +295,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, TTSSpeakFrame): await self._push_tts_frames(frame.text) elif isinstance(frame, TTSUpdateSettingsFrame): - if frame.model is not None: - await self.set_model(frame.model) - if frame.voice is not None: - await self.set_voice(frame.voice) - if frame.language is not None: - await self.set_language(frame.language) - if frame.speed is not None: - await self.set_speed(frame.speed) - if frame.emotion is not None: - await self.set_emotion(frame.emotion) - if frame.engine is not None: - await self.set_engine(frame.engine) - if frame.pitch is not None: - await self.set_pitch(frame.pitch) - if frame.rate is not None: - await self.set_rate(frame.rate) - if frame.volume is not None: - await self.set_volume(frame.volume) - if frame.emphasis is not None: - await self.set_emphasis(frame.emphasis) - if frame.style is not None: - await self.set_style(frame.style) - if frame.style_degree is not None: - await self.set_style_degree(frame.style_degree) - if frame.role is not None: - await self.set_role(frame.role) + await self._update_tts_settings(frame) else: await self.push_frame(frame, direction) @@ -454,6 +457,12 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: """Returns transcript as a string""" pass + async def _update_stt_settings(self, frame: STTUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.language is not None: + await self.set_language(frame.language) + async def process_audio_frame(self, frame: AudioRawFrame): await self.process_generator(self.run_stt(frame.audio)) @@ -466,10 +475,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # push a TextFrame. We don't really want to push audio frames down. 
await self.process_audio_frame(frame) elif isinstance(frame, STTUpdateSettingsFrame): - if frame.model is not None: - await self.set_model(frame.model) - if frame.language is not None: - await self.set_language(frame.language) + await self._update_stt_settings(frame) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 1c4cd284e..bc91e4e16 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -279,6 +279,21 @@ async def _process_context(self, context: OpenAILLMContext): cache_read_input_tokens=cache_read_input_tokens, ) + async def _update_settings(self, frame: LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -294,19 +309,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # to the context. context = AnthropicLLMContext.from_image_frame(frame) elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.max_tokens is not None: - await self.set_max_tokens(frame.max_tokens) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_k is not None: - await self.set_top_k(frame.top_k) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + await self._update_settings(frame) elif isinstance(frame, LLMEnablePromptCachingFrame): logger.debug(f"Setting enable prompt caching to: [{frame.enable}]") self._enable_prompt_caching_beta = frame.enable diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index a830b65a8..f0892b9ca 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -273,6 +273,23 @@ async def _handle_function_call(self, context, tool_call_id, function_name, argu arguments=arguments, ) + async def _update_settings(self, frame: LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.seed is not None: + await self.set_seed(frame.seed) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -284,21 +301,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not 
None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.frequency_penalty is not None: - await self.set_frequency_penalty(frame.frequency_penalty) - if frame.presence_penalty is not None: - await self.set_presence_penalty(frame.presence_penalty) - if frame.seed is not None: - await self.set_seed(frame.seed) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + await self._update_settings(frame) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 981aa6de2..e4068ecfc 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -128,6 +128,25 @@ async def set_extra(self, extra: Dict[str, Any]): logger.debug(f"Switching LLM extra to: [{extra}]") self._extra = extra + async def _update_settings(self, frame: LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) + async def _process_context(self, context: OpenAILLMContext): try: await self.push_frame(LLMFullResponseStartFrame()) @@ -206,23 +225,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, LLMMessagesFrame): context = TogetherLLMContext.from_messages(frame.messages) elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.frequency_penalty is not None: - await self.set_frequency_penalty(frame.frequency_penalty) - if frame.max_tokens is not None: - await self.set_max_tokens(frame.max_tokens) - if frame.presence_penalty is not None: - await self.set_presence_penalty(frame.presence_penalty) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_k is not None: - await self.set_top_k(frame.top_k) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + await self._update_settings(frame) else: await self.push_frame(frame, direction) From ed49cebf2c32ec522f2dc9d592ca63e103382ef9 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 15:16:46 -0400 Subject: [PATCH 44/60] Set Google TTS default language to en-US --- src/pipecat/services/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 53efd8c17..519f47028 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -153,7 +153,7 @@ class InputParams(BaseModel): rate: Optional[str] = None volume: Optional[str] = None emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None - language: Optional[str] = None + language: Optional[str] 
= "en-US" gender: Optional[Literal["male", "female", "neutral"]] = None google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None From 6b7f924af606bffd006a7e239af5be281ae9f83d Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 30 Sep 2024 14:33:08 -0700 Subject: [PATCH 45/60] tts sentence aggregation fix --- .../07a-interruptible-anthropic.py | 27 ++++++++----------- src/pipecat/services/ai_services.py | 12 +++++---- src/pipecat/utils/string.py | 6 ++--- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/examples/foundational/07a-interruptible-anthropic.py b/examples/foundational/07a-interruptible-anthropic.py index 2bded2480..288cb1b31 100644 --- a/examples/foundational/07a-interruptible-anthropic.py +++ b/examples/foundational/07a-interruptible-anthropic.py @@ -5,29 +5,24 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, - LLMUserResponseAggregator, -) -from pipecat.services.cartesia import CartesiaTTSService +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.anthropic import AnthropicLLMService +from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -69,17 +64,17 @@ async def main(): }, ] - tma_in = LLMUserResponseAggregator(messages) - tma_out = LLMAssistantResponseAggregator(messages) + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) pipeline = Pipeline( [ transport.input(), # Transport user input - tma_in, # User responses + context_aggregator.user(), # User responses llm, # LLM tts, # TTS transport.output(), # Transport bot output - tma_out, # Assistant spoken responses + context_aggregator.assistant(), # Assistant spoken responses ] ) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index ba78b24f8..8386fccd5 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -228,16 +228,18 @@ async def _process_text_frame(self, frame: TextFrame): text = frame.text else: self._current_sentence += frame.text - if match_endofsentence(self._current_sentence): - text = self._current_sentence - self._current_sentence = "" + eos_end_marker = match_endofsentence(self._current_sentence) + if eos_end_marker: + text = self._current_sentence[:eos_end_marker] + self._current_sentence = self._current_sentence[eos_end_marker:] if text: await self._push_tts_frames(text) async def _push_tts_frames(self, text: str): - text = text.strip() - if not text: + # Don't send only whitespace. This causes problems for some TTS models. But also don't + # strip all whitespace, as whitespace can influence prosody. 
+ if not text.strip(): return await self.start_processing_metrics() diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py index cf9a22ad8..936764345 100644 --- a/src/pipecat/utils/string.py +++ b/src/pipecat/utils/string.py @@ -6,7 +6,6 @@ import re - ENDOFSENTENCE_PATTERN_STR = r""" (? bool: - return ENDOFSENTENCE_PATTERN.search(text.rstrip()) is not None +def match_endofsentence(text: str) -> int: + match = ENDOFSENTENCE_PATTERN.search(text.rstrip()) + return match.end() if match else 0 From c8995b82e56d3d293057e12a7a46143e4becf71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 10:05:56 -0700 Subject: [PATCH 46/60] all frame processors are asynchrnous In this commit we make all frame processors asynchronous, that is, they have an internal queue and they push frames using a task from that queue. --- CHANGELOG.md | 22 +-- .../foundational/05-sync-speech-and-image.py | 10 +- .../05a-local-sync-speech-and-image.py | 8 +- src/pipecat/processors/frame_processor.py | 22 +-- src/pipecat/processors/frameworks/rtvi.py | 2 +- .../processors/gstreamer/pipeline_source.py | 2 +- .../processors/idle_frame_processor.py | 2 +- src/pipecat/processors/user_idle_processor.py | 2 +- src/pipecat/services/ai_services.py | 155 ++++++++---------- src/pipecat/services/cartesia.py | 4 +- src/pipecat/services/elevenlabs.py | 4 +- src/pipecat/services/gladia.py | 2 +- src/pipecat/services/lmnt.py | 6 +- src/pipecat/transports/base_input.py | 2 +- src/pipecat/transports/base_output.py | 2 +- 15 files changed, 113 insertions(+), 132 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f489556c..b59ed56c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,15 +48,10 @@ async def on_connected(processor): frames. To achieve that, each frame processor should only output frames from a single task. - In this version we introduce synchronous and asynchronous frame - processors. The synchronous processors push output frames from the same task - that they receive input frames, and therefore only pushing frames from one - task. Asynchronous frame processors can have internal tasks to perform things - asynchronously (e.g. receiving data from a websocket) but they also have a - single task where they push frames from. - - By default, frame processors are synchronous. To change a frame processor to - asynchronous you only need to pass `sync=False` to the base class constructor. + In this version all the frame processors have their own task to push + frames. That is, when `push_frame()` is called the given frame will be put + into an internal queue (with the exception of system frames) and a frame + processor task will push it out. - Added pipeline clocks. A pipeline clock is used by the output transport to know when a frame needs to be presented. For that, all frames now have an @@ -68,9 +63,7 @@ async def on_connected(processor): `SystemClock`). This clock will be passed to each frame processor via the `StartFrame`. -- Added `CartesiaHttpTTSService`. This is a synchronous frame processor - (i.e. given an input text frame it will wait for the whole output before - returning). +- Added `CartesiaHttpTTSService`. - `DailyTransport` now supports setting the audio bitrate to improve audio quality through the `DailyParams.audio_out_bitrate` parameter. The new @@ -110,8 +103,9 @@ async def on_connected(processor): pipelines to be executed concurrently. 
The difference between a `SyncParallelPipeline` and a `ParallelPipeline` is that, given an input frame, the `SyncParallelPipeline` will wait for all the internal pipelines to - complete. This is achieved by ensuring all the processors in each of the - internal pipelines are synchronous. + complete. This is achieved by making sure the last processor in each of the + pipelines is synchronous (e.g. an HTTP-based service that waits for the + response). - `StartFrame` is back a system frame so we make sure it's processed immediately by all processors. `EndFrame` stays a control frame since it needs to be diff --git a/examples/foundational/05-sync-speech-and-image.py b/examples/foundational/05-sync-speech-and-image.py index dae860a92..5477d0691 100644 --- a/examples/foundational/05-sync-speech-and-image.py +++ b/examples/foundational/05-sync-speech-and-image.py @@ -86,13 +86,13 @@ async def main(): ), ) + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") - imagegen = FalImageGenService( params=FalImageGenService.InputParams(image_size="square_hd"), aiohttp_session=session, @@ -107,8 +107,10 @@ async def main(): # that, each pipeline runs concurrently and `SyncParallelPipeline` will # wait for the input frame to be processed. # - # Note that `SyncParallelPipeline` requires all processors in it to be - # synchronous (which is the default for most processors). + # Note that `SyncParallelPipeline` requires the last processor in each + # of the pipelines to be synchronous. In this case, we use + # `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP + # requests and wait for the response. pipeline = Pipeline( [ llm, # LLM diff --git a/examples/foundational/05a-local-sync-speech-and-image.py b/examples/foundational/05a-local-sync-speech-and-image.py index 27c36f6ce..4a561c073 100644 --- a/examples/foundational/05a-local-sync-speech-and-image.py +++ b/examples/foundational/05a-local-sync-speech-and-image.py @@ -82,6 +82,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self.frame = OutputAudioRawFrame( bytes(self.audio), frame.sample_rate, frame.num_channels ) + await self.push_frame(frame, direction) class ImageGrabber(FrameProcessor): def __init__(self): @@ -93,6 +94,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, URLImageRawFrame): self.frame = frame + await self.push_frame(frame, direction) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") @@ -121,8 +123,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # `SyncParallelPipeline` will wait for the input frame to be # processed. # - # Note that `SyncParallelPipeline` requires all processors in it to - # be synchronous (which is the default for most processors). + # Note that `SyncParallelPipeline` requires the last processor in + # each of the pipelines to be synchronous. In this case, we use + # `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP + # requests and wait for the response. 
pipeline = Pipeline( [ llm, # LLM diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index f71e066d7..f458f43ff 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -37,7 +37,6 @@ def __init__( *, name: str | None = None, metrics: FrameProcessorMetrics | None = None, - sync: bool = True, loop: asyncio.AbstractEventLoop | None = None, **kwargs, ): @@ -47,7 +46,6 @@ def __init__( self._prev: "FrameProcessor" | None = None self._next: "FrameProcessor" | None = None self._loop: asyncio.AbstractEventLoop = loop or asyncio.get_running_loop() - self._sync = sync self._event_handlers: dict = {} @@ -66,11 +64,8 @@ def __init__( # Every processor in Pipecat should only output frames from a single # task. This avoid problems like audio overlapping. System frames are - # the exception to this rule. - # - # This create this task. - if not self._sync: - self.__create_push_task() + # the exception to this rule. This create this task. + self.__create_push_task() @property def interruptions_allowed(self): @@ -167,7 +162,7 @@ async def push_error(self, error: ErrorFrame): await self.push_frame(error, FrameDirection.UPSTREAM) async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): - if self._sync or isinstance(frame, SystemFrame): + if isinstance(frame, SystemFrame): await self.__internal_push_frame(frame, direction) else: await self.__push_queue.put((frame, direction)) @@ -194,13 +189,12 @@ def _register_event_handler(self, event_name: str): # async def _start_interruption(self): - if not self._sync: - # Cancel the task. This will stop pushing frames downstream. - self.__push_frame_task.cancel() - await self.__push_frame_task + # Cancel the task. This will stop pushing frames downstream. + self.__push_frame_task.cancel() + await self.__push_frame_task - # Create a new queue and task. - self.__create_push_task() + # Create a new queue and task. + self.__create_push_task() async def _stop_interruption(self): # Nothing to do right now. 
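Taken together with the changelog text above, the frame_processor.py change boils down to a small asyncio pattern: every processor owns an internal queue plus a single task that drains it, and only system frames bypass the queue so they are handled immediately. Below is a minimal, self-contained sketch of that pattern; ToyProcessor and the toy Frame classes are illustrative stand-ins, not Pipecat's real API.

    import asyncio
    from dataclasses import dataclass


    @dataclass
    class Frame:
        name: str


    @dataclass
    class SystemFrame(Frame):
        pass


    class ToyProcessor:
        # Stand-in for a frame processor: it owns one queue and one task that
        # pushes queued frames downstream, so all output comes from a single task.
        def __init__(self):
            self._queue: asyncio.Queue = asyncio.Queue()
            self._task = asyncio.create_task(self._push_task_handler())

        async def push_frame(self, frame: Frame):
            if isinstance(frame, SystemFrame):
                # System frames bypass the queue and are delivered immediately.
                await self._deliver(frame)
            else:
                await self._queue.put(frame)

        async def _push_task_handler(self):
            # The single task that pushes all non-system frames, in order.
            while True:
                frame = await self._queue.get()
                await self._deliver(frame)
                self._queue.task_done()

        async def _deliver(self, frame: Frame):
            print(f"delivered {frame.name}")


    async def main():
        processor = ToyProcessor()
        await processor.push_frame(Frame("audio-1"))
        await processor.push_frame(SystemFrame("interruption"))  # not queued
        await processor.push_frame(Frame("audio-2"))
        await processor._queue.join()  # wait for the queued frames to go out
        processor._task.cancel()


    # Prints "delivered interruption" first, then "audio-1" and "audio-2".
    asyncio.run(main())

In the real FrameProcessor the queue and task come from __create_push_task(), and _start_interruption() cancels the task and creates a fresh queue, which discards anything still waiting in the old one.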
diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index f88660f60..7a6054c3c 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -516,7 +516,7 @@ def __init__( params: RTVIProcessorParams = RTVIProcessorParams(), **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._config = config self._params = params diff --git a/src/pipecat/processors/gstreamer/pipeline_source.py b/src/pipecat/processors/gstreamer/pipeline_source.py index 9f8471153..426eab50a 100644 --- a/src/pipecat/processors/gstreamer/pipeline_source.py +++ b/src/pipecat/processors/gstreamer/pipeline_source.py @@ -44,7 +44,7 @@ class OutputParams(BaseModel): clock_sync: bool = True def __init__(self, *, pipeline: str, out_params: OutputParams = OutputParams(), **kwargs): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._out_params = out_params diff --git a/src/pipecat/processors/idle_frame_processor.py b/src/pipecat/processors/idle_frame_processor.py index 576cb9087..e674b6b84 100644 --- a/src/pipecat/processors/idle_frame_processor.py +++ b/src/pipecat/processors/idle_frame_processor.py @@ -26,7 +26,7 @@ def __init__( types: List[type] = [], **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._callback = callback self._timeout = timeout diff --git a/src/pipecat/processors/user_idle_processor.py b/src/pipecat/processors/user_idle_processor.py index 31d49cf5a..507dcb495 100644 --- a/src/pipecat/processors/user_idle_processor.py +++ b/src/pipecat/processors/user_idle_processor.py @@ -31,7 +31,7 @@ def __init__( timeout: float, **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._callback = callback self._timeout = timeout diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index ba78b24f8..d27a3277d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -144,6 +144,10 @@ def __init__( # if True, TTSService will push TextFrames and LLMFullResponseEndFrames, # otherwise subclass must do it push_text_frames: bool = True, + # if True, TTSService will push TTSStoppedFrames, otherwise subclass must do it + push_stop_frames: bool = False, + # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame + stop_frame_timeout_s: float = 1.0, # TTS output sample rate sample_rate: int = 16000, **kwargs, @@ -151,9 +155,15 @@ def __init__( super().__init__(**kwargs) self._aggregate_sentences: bool = aggregate_sentences self._push_text_frames: bool = push_text_frames - self._current_sentence: str = "" + self._push_stop_frames: bool = push_stop_frames + self._stop_frame_timeout_s: float = stop_frame_timeout_s self._sample_rate: int = sample_rate + self._stop_frame_task: Optional[asyncio.Task] = None + self._stop_frame_queue: asyncio.Queue = asyncio.Queue() + + self._current_sentence: str = "" + @property def sample_rate(self) -> int: return self._sample_rate @@ -210,13 +220,72 @@ async def set_style_degree(self, style_degree: str): async def set_role(self, role: str): pass + @abstractmethod + async def flush_audio(self): + pass + # Converts the text to audio. 
@abstractmethod async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: pass + async def start(self, frame: StartFrame): + await super().start(frame) + if self._push_stop_frames: + self._stop_frame_task = self.get_event_loop().create_task(self._stop_frame_handler()) + + async def stop(self, frame: EndFrame): + await super().stop(frame) + if self._stop_frame_task: + self._stop_frame_task.cancel() + await self._stop_frame_task + self._stop_frame_task = None + + async def cancel(self, frame: CancelFrame): + await super().cancel(frame) + if self._stop_frame_task: + self._stop_frame_task.cancel() + await self._stop_frame_task + self._stop_frame_task = None + async def say(self, text: str): await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM) + await self.flush_audio() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, TextFrame): + await self._process_text_frame(frame) + elif isinstance(frame, StartInterruptionFrame): + await self._handle_interruption(frame, direction) + elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame): + sentence = self._current_sentence + self._current_sentence = "" + await self._push_tts_frames(sentence) + if isinstance(frame, LLMFullResponseEndFrame): + if self._push_text_frames: + await self.push_frame(frame, direction) + else: + await self.push_frame(frame, direction) + elif isinstance(frame, TTSSpeakFrame): + await self._push_tts_frames(frame.text) + await self.flush_audio() + elif isinstance(frame, TTSUpdateSettingsFrame): + await self._update_tts_settings(frame) + else: + await self.push_frame(frame, direction) + + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): + await super().push_frame(frame, direction) + + if self._push_stop_frames and ( + isinstance(frame, StartInterruptionFrame) + or isinstance(frame, TTSStartedFrame) + or isinstance(frame, TTSAudioRawFrame) + or isinstance(frame, TTSStoppedFrame) + ): + await self._stop_frame_queue.put(frame) async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): self._current_sentence = "" @@ -276,88 +345,6 @@ async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame): if frame.role is not None: await self.set_role(frame.role) - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - - if isinstance(frame, TextFrame): - await self._process_text_frame(frame) - elif isinstance(frame, StartInterruptionFrame): - await self._handle_interruption(frame, direction) - elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame): - sentence = self._current_sentence - self._current_sentence = "" - await self._push_tts_frames(sentence) - if isinstance(frame, LLMFullResponseEndFrame): - if self._push_text_frames: - await self.push_frame(frame, direction) - else: - await self.push_frame(frame, direction) - elif isinstance(frame, TTSSpeakFrame): - await self._push_tts_frames(frame.text) - elif isinstance(frame, TTSUpdateSettingsFrame): - await self._update_tts_settings(frame) - else: - await self.push_frame(frame, direction) - - -class AsyncTTSService(TTSService): - def __init__( - self, - # if True, TTSService will push TTSStoppedFrames, otherwise subclass must do it - push_stop_frames: bool = False, - # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame - 
stop_frame_timeout_s: float = 1.0, - **kwargs, - ): - super().__init__(sync=False, **kwargs) - self._push_stop_frames: bool = push_stop_frames - self._stop_frame_timeout_s: float = stop_frame_timeout_s - self._stop_frame_task: Optional[asyncio.Task] = None - self._stop_frame_queue: asyncio.Queue = asyncio.Queue() - - @abstractmethod - async def flush_audio(self): - pass - - async def say(self, text: str): - await super().say(text) - await self.flush_audio() - - async def start(self, frame: StartFrame): - await super().start(frame) - if self._push_stop_frames: - self._stop_frame_task = self.get_event_loop().create_task(self._stop_frame_handler()) - - async def stop(self, frame: EndFrame): - await super().stop(frame) - if self._stop_frame_task: - self._stop_frame_task.cancel() - await self._stop_frame_task - self._stop_frame_task = None - - async def cancel(self, frame: CancelFrame): - await super().cancel(frame) - if self._stop_frame_task: - self._stop_frame_task.cancel() - await self._stop_frame_task - self._stop_frame_task = None - - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - if isinstance(frame, TTSSpeakFrame): - await self.flush_audio() - - async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): - await super().push_frame(frame, direction) - - if self._push_stop_frames and ( - isinstance(frame, StartInterruptionFrame) - or isinstance(frame, TTSStartedFrame) - or isinstance(frame, TTSAudioRawFrame) - or isinstance(frame, TTSStoppedFrame) - ): - await self._stop_frame_queue.put(frame) - async def _stop_frame_handler(self): try: has_started = False @@ -378,7 +365,7 @@ async def _stop_frame_handler(self): pass -class AsyncWordTTSService(AsyncTTSService): +class WordTTSService(TTSService): def __init__(self, **kwargs): super().__init__(**kwargs) self._initial_word_timestamp = -1 diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index e38d56db3..5f798b1e5 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -26,7 +26,7 @@ ) from pipecat.processors.frame_processor import FrameDirection from pipecat.transcriptions.language import Language -from pipecat.services.ai_services import AsyncWordTTSService, TTSService +from pipecat.services.ai_services import WordTTSService, TTSService from loguru import logger @@ -61,7 +61,7 @@ def language_to_cartesia_language(language: Language) -> str | None: return None -class CartesiaTTSService(AsyncWordTTSService): +class CartesiaTTSService(WordTTSService): class InputParams(BaseModel): encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index ca4713f5f..611f2a024 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -23,7 +23,7 @@ TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import AsyncWordTTSService +from pipecat.services.ai_services import WordTTSService # See .env.example for ElevenLabs configuration needed try: @@ -70,7 +70,7 @@ def calculate_word_times( return word_times -class ElevenLabsTTSService(AsyncWordTTSService): +class ElevenLabsTTSService(WordTTSService): class InputParams(BaseModel): language: Optional[str] = None output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" diff --git a/src/pipecat/services/gladia.py 
b/src/pipecat/services/gladia.py index 12183adde..a590d73cf 100644 --- a/src/pipecat/services/gladia.py +++ b/src/pipecat/services/gladia.py @@ -51,7 +51,7 @@ def __init__( params: InputParams = InputParams(), **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._api_key = api_key self._url = url diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py index 1ac24d731..8f18002c5 100644 --- a/src/pipecat/services/lmnt.py +++ b/src/pipecat/services/lmnt.py @@ -20,7 +20,7 @@ TTSStartedFrame, TTSStoppedFrame, ) -from pipecat.services.ai_services import AsyncTTSService +from pipecat.services.ai_services import TTSService from loguru import logger @@ -35,7 +35,7 @@ raise Exception(f"Missing module: {e}") -class LmntTTSService(AsyncTTSService): +class LmntTTSService(TTSService): def __init__( self, *, @@ -47,7 +47,7 @@ def __init__( ): # Let TTSService produce TTSStoppedFrames after a short delay of # no activity. - super().__init__(sync=False, push_stop_frames=True, sample_rate=sample_rate, **kwargs) + super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs) self._api_key = api_key self._voice_id = voice_id diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index df7babff1..710f8108a 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -31,7 +31,7 @@ class BaseInputTransport(FrameProcessor): def __init__(self, params: TransportParams, **kwargs): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._params = params diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 941a3505a..c3b9c792b 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -43,7 +43,7 @@ class BaseOutputTransport(FrameProcessor): def __init__(self, params: TransportParams, **kwargs): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._params = params From a90ebdfe7c881d5fe4d3c6850bfc556f9d1a3c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 11:36:59 -0700 Subject: [PATCH 47/60] syncparallelpipeline: fix now that all frames are asynchronous --- .../pipeline/sync_parallel_pipeline.py | 66 ++++++++++++++++--- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/src/pipecat/pipeline/sync_parallel_pipeline.py b/src/pipecat/pipeline/sync_parallel_pipeline.py index 854cea89d..20f4275e4 100644 --- a/src/pipecat/pipeline/sync_parallel_pipeline.py +++ b/src/pipecat/pipeline/sync_parallel_pipeline.py @@ -6,17 +6,25 @@ import asyncio +from dataclasses import dataclass from itertools import chain from typing import List +from pipecat.frames.frames import ControlFrame, EndFrame, Frame, SystemFrame from pipecat.pipeline.base_pipeline import BasePipeline from pipecat.pipeline.pipeline import Pipeline from pipecat.processors.frame_processor import FrameDirection, FrameProcessor -from pipecat.frames.frames import Frame from loguru import logger +@dataclass +class SyncFrame(ControlFrame): + """This frame is used to know when the internal pipelines have finished.""" + + pass + + class Source(FrameProcessor): def __init__(self, upstream_queue: asyncio.Queue): super().__init__() @@ -67,13 +75,16 @@ def __init__(self, *args): raise TypeError(f"SyncParallelPipeline argument {processors} is not a list") # We add a source at the beginning of the pipeline and a sink at the end. 
- source = Source(self._up_queue) - sink = Sink(self._down_queue) + up_queue = asyncio.Queue() + down_queue = asyncio.Queue() + source = Source(up_queue) + sink = Sink(down_queue) processors: List[FrameProcessor] = [source] + processors + [sink] - # Keep track of sources and sinks. - self._sources.append(source) - self._sinks.append(sink) + # Keep track of sources and sinks. We also keep the output queue of + # the source and the sinks so we can use it later. + self._sources.append({"processor": source, "queue": down_queue}) + self._sinks.append({"processor": sink, "queue": up_queue}) # Create pipeline pipeline = Pipeline(processors) @@ -94,17 +105,52 @@ def processors_with_metrics(self) -> List[FrameProcessor]: async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) + # The last processor of each pipeline needs to be synchronous otherwise + # this element won't work. Since, we know it should be synchronous we + # push a SyncFrame. Since frames are ordered we know this frame will be + # pushed after the synchronous processor has pushed its data allowing us + # to synchrnonize all the internal pipelines by waiting for the + # SyncFrame in all of them. + async def wait_for_sync( + obj, main_queue: asyncio.Queue, frame: Frame, direction: FrameDirection + ): + processor = obj["processor"] + queue = obj["queue"] + + await processor.process_frame(frame, direction) + + if isinstance(frame, (SystemFrame, EndFrame)): + new_frame = await queue.get() + if isinstance(new_frame, (SystemFrame, EndFrame)): + await main_queue.put(new_frame) + else: + while not isinstance(new_frame, (SystemFrame, EndFrame)): + await main_queue.put(new_frame) + queue.task_done() + new_frame = await queue.get() + else: + await processor.process_frame(SyncFrame(), direction) + new_frame = await queue.get() + while not isinstance(new_frame, SyncFrame): + await main_queue.put(new_frame) + queue.task_done() + new_frame = await queue.get() + if direction == FrameDirection.UPSTREAM: # If we get an upstream frame we process it in each sink. - await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks]) + await asyncio.gather( + *[wait_for_sync(s, self._up_queue, frame, direction) for s in self._sinks] + ) elif direction == FrameDirection.DOWNSTREAM: # If we get a downstream frame we process it in each source. 
- await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sources]) + await asyncio.gather( + *[wait_for_sync(s, self._down_queue, frame, direction) for s in self._sources] + ) seen_ids = set() while not self._up_queue.empty(): frame = await self._up_queue.get() - if frame and frame.id not in seen_ids: + if frame.id not in seen_ids: await self.push_frame(frame, FrameDirection.UPSTREAM) seen_ids.add(frame.id) self._up_queue.task_done() @@ -112,7 +158,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): seen_ids = set() while not self._down_queue.empty(): frame = await self._down_queue.get() - if frame and frame.id not in seen_ids: + if frame.id not in seen_ids: await self.push_frame(frame, FrameDirection.DOWNSTREAM) seen_ids.add(frame.id) self._down_queue.task_done() From d080a31a5ca322364f5645aafa3727d47ea32f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 11:37:26 -0700 Subject: [PATCH 48/60] tests: fix langchanin tests --- tests/test_langchain.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_langchain.py b/tests/test_langchain.py index fb222205b..d30d213bd 100644 --- a/tests/test_langchain.py +++ b/tests/test_langchain.py @@ -7,9 +7,9 @@ import unittest from pipecat.frames.frames import ( + EndFrame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, - StopTaskFrame, TextFrame, TranscriptionFrame, UserStartedSpeakingFrame, @@ -32,6 +32,7 @@ class TestLangchain(unittest.IsolatedAsyncioTestCase): class MockProcessor(FrameProcessor): def __init__(self, name): + super().__init__() self.name = name self.token: list[str] = [] # Start collecting tokens when we see the start frame @@ -55,13 +56,13 @@ async def process_frame(self, frame, direction): def setUp(self): self.expected_response = "Hello dear human" self.fake_llm = FakeStreamingListLLM(responses=[self.expected_response]) - self.mock_proc = self.MockProcessor("token_collector") async def test_langchain(self): messages = [("system", "Say hello to {name}"), ("human", "{input}")] prompt = ChatPromptTemplate.from_messages(messages).partial(name="Thomas") chain = prompt | self.fake_llm proc = LangchainProcessor(chain=chain) + self.mock_proc = self.MockProcessor("token_collector") tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) @@ -81,7 +82,7 @@ async def test_langchain(self): UserStartedSpeakingFrame(), TranscriptionFrame(text="Hi World", user_id="user", timestamp="now"), UserStoppedSpeakingFrame(), - StopTaskFrame(), + EndFrame(), ] ) From 4d1e370e02b411142ad78bb80743fd2d58efccb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 11:56:59 -0700 Subject: [PATCH 49/60] pipeline(task): since everything is async tasks should wait for EndFrame --- src/pipecat/pipeline/task.py | 43 ++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index f79ff6f39..96845430d 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -69,6 +69,19 @@ async def _handle_upstream_frame(self, frame: Frame): await self._up_queue.put(StopTaskFrame()) +class Sink(FrameProcessor): + def __init__(self, down_queue: asyncio.Queue): + super().__init__() + self._down_queue = down_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + # We really just 
want to know when the EndFrame reached the sink. + if isinstance(frame, EndFrame): + await self._down_queue.put(frame) + + class PipelineTask: def __init__( self, @@ -84,12 +97,16 @@ def __init__( self._params = params self._finished = False - self._down_queue = asyncio.Queue() self._up_queue = asyncio.Queue() + self._down_queue = asyncio.Queue() + self._push_queue = asyncio.Queue() self._source = Source(self._up_queue) self._source.link(pipeline) + self._sink = Sink(self._down_queue) + pipeline.link(self._sink) + def has_finished(self): return self._finished @@ -103,19 +120,19 @@ async def cancel(self): # out-of-band from the main streaming task which is what we want since # we want to cancel right away. await self._source.push_frame(CancelFrame()) - self._process_down_task.cancel() + self._process_push_task.cancel() self._process_up_task.cancel() - await self._process_down_task + await self._process_push_task await self._process_up_task async def run(self): self._process_up_task = asyncio.create_task(self._process_up_queue()) - self._process_down_task = asyncio.create_task(self._process_down_queue()) - await asyncio.gather(self._process_up_task, self._process_down_task) + self._process_push_task = asyncio.create_task(self._process_push_queue()) + await asyncio.gather(self._process_up_task, self._process_push_task) self._finished = True async def queue_frame(self, frame: Frame): - await self._down_queue.put(frame) + await self._push_queue.put(frame) async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]): if isinstance(frames, AsyncIterable): @@ -133,7 +150,7 @@ def _initial_metrics_frame(self) -> MetricsFrame: data.append(ProcessingMetricsData(processor=p.name, value=0.0)) return MetricsFrame(data=data) - async def _process_down_queue(self): + async def _process_push_queue(self): self._clock.start() start_frame = StartFrame( @@ -154,11 +171,13 @@ async def _process_down_queue(self): should_cleanup = True while running: try: - frame = await self._down_queue.get() + frame = await self._push_queue.get() await self._source.process_frame(frame, FrameDirection.DOWNSTREAM) + if isinstance(frame, EndFrame): + await self._wait_for_endframe() running = not (isinstance(frame, StopTaskFrame) or isinstance(frame, EndFrame)) should_cleanup = not isinstance(frame, StopTaskFrame) - self._down_queue.task_done() + self._push_queue.task_done() except asyncio.CancelledError: break # Cleanup only if we need to. @@ -169,6 +188,12 @@ async def _process_down_queue(self): self._process_up_task.cancel() await self._process_up_task + async def _wait_for_endframe(self): + # NOTE(aleix): the Sink element just pushes EndFrames to the down queue, + # so just wait for it. In the future we might do something else here, + # but for now this is fine. 
+ await self._down_queue.get() + async def _process_up_queue(self): while True: try: From f8a75cede901c9f614e83c4d62ce24fc7a97a410 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 18:22:38 -0400 Subject: [PATCH 50/60] Update daily-python to 0.11.0 --- pyproject.toml | 2 +- test-requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e31755d50..a29697bdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ anthropic = [ "anthropic~=0.34.0" ] aws = [ "boto3~=1.35.27" ] azure = [ "azure-cognitiveservices-speech~=1.40.0" ] cartesia = [ "cartesia~=1.0.13", "websockets~=12.0" ] -daily = [ "daily-python~=0.10.1" ] +daily = [ "daily-python~=0.11.0" ] deepgram = [ "deepgram-sdk~=3.5.0" ] elevenlabs = [ "websockets~=12.0" ] examples = [ "python-dotenv~=1.0.1", "flask~=3.0.3", "flask_cors~=4.0.1" ] diff --git a/test-requirements.txt b/test-requirements.txt index 8c7db7377..07ef45054 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -2,7 +2,7 @@ aiohttp~=3.10.3 anthropic~=0.30.0 azure-cognitiveservices-speech~=1.40.0 boto3~=1.35.27 -daily-python~=0.10.1 +daily-python~=0.11.0 deepgram-sdk~=3.5.0 fal-client~=0.4.1 fastapi~=0.112.1 From 69c7edd60c8d586bf786f7f442e2217d13d9db42 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Sep 2024 21:37:03 -0700 Subject: [PATCH 51/60] pushing context frames from assistant aggregators --- src/pipecat/services/anthropic.py | 5 ++++- src/pipecat/services/openai.py | 5 ++++- src/pipecat/services/together.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index bc91e4e16..6a535ef15 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -579,7 +579,7 @@ async def _push_aggregation(self): run_llm = False aggregation = self._aggregation - self._aggregation = "" + self._reset() try: if self._function_call_result: @@ -630,5 +630,8 @@ async def _push_aggregation(self): if run_llm: await self._user_context_aggregator.push_context_frame() + frame = OpenAILLMContextFrame(self._context) + await self.push_frame(frame) + except Exception as e: logger.error(f"Error processing frame: {e}") diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index f0892b9ca..99d2d7497 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -495,7 +495,7 @@ async def _push_aggregation(self): run_llm = False aggregation = self._aggregation - self._aggregation = "" + self._reset() try: if self._function_call_result: @@ -531,5 +531,8 @@ async def _push_aggregation(self): if run_llm: await self._user_context_aggregator.push_context_frame() + frame = OpenAILLMContextFrame(self._context) + await self.push_frame(frame) + except Exception as e: logger.error(f"Error processing frame: {e}") diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index e4068ecfc..935f625ad 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -370,7 +370,7 @@ async def _push_aggregation(self): run_llm = False aggregation = self._aggregation - self._aggregation = "" + self._reset() try: if self._function_call_result: @@ -390,5 +390,8 @@ async def _push_aggregation(self): if run_llm: await self._user_context_aggregator.push_messages_frame() + frame = OpenAILLMContextFrame(self._context) + await self.push_frame(frame) + except Exception as e: logger.error(f"Error 
processing frame: {e}") From 37da7e44cdbd7ec3d580926320b4cc5edc9ae450 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Sep 2024 22:01:40 -0700 Subject: [PATCH 52/60] whitespace fix --- .../07l-interruptible-together.py | 27 +++++++++---------- .../processors/aggregators/llm_response.py | 15 +++++------ 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py index e2cb55fed..a7086c941 100644 --- a/examples/foundational/07l-interruptible-together.py +++ b/examples/foundational/07l-interruptible-together.py @@ -5,29 +5,24 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, - LLMUserResponseAggregator, -) +from pipecat.services.ai_services import OpenAILLMContext from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.together import TogetherLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -76,17 +71,19 @@ async def main(): }, ] - tma_in = LLMUserResponseAggregator(messages) - tma_out = LLMAssistantResponseAggregator(messages) + context = OpenAILLMContext(messages, tools) + context_aggregator = llm.create_context_aggregator(context) + user_aggregator = context_aggregator.user() + assistant_aggregator = context_aggregator.assistant() pipeline = Pipeline( [ transport.input(), # Transport user input - tma_in, # User responses + user_aggregator, # User responses llm, # LLM tts, # TTS transport.output(), # Transport bot output - tma_out, # Assistant spoken responses + assistant_aggregator, # Assistant spoken responses ] ) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 036f5fe47..a3cd63cbd 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -6,12 +6,6 @@ from typing import List, Type -from pipecat.processors.aggregators.openai_llm_context import ( - OpenAILLMContextFrame, - OpenAILLMContext, -) - -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.frames.frames import ( Frame, InterimTranscriptionFrame, @@ -22,11 +16,16 @@ LLMMessagesUpdateFrame, LLMSetToolsFrame, StartInterruptionFrame, - TranscriptionFrame, TextFrame, + TranscriptionFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor class LLMResponseAggregator(FrameProcessor): @@ -111,7 +110,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) elif isinstance(frame, self._accumulator_frame): if self._aggregating: - self._aggregation += f" {frame.text}" if self._aggregation else frame.text + self._aggregation += 
frame.text if self._aggregation else frame.text # We have recevied a complete sentence, so if we have seen the # end frame and we were still aggregating, it means we should # send the aggregation. From ed607d5c4b30695c3d7a9a669b3ac707ec62647d Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Sep 2024 22:02:33 -0700 Subject: [PATCH 53/60] typo fix --- examples/foundational/07l-interruptible-together.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py index a7086c941..ca3386718 100644 --- a/examples/foundational/07l-interruptible-together.py +++ b/examples/foundational/07l-interruptible-together.py @@ -71,7 +71,7 @@ async def main(): }, ] - context = OpenAILLMContext(messages, tools) + context = OpenAILLMContext(messages) context_aggregator = llm.create_context_aggregator(context) user_aggregator = context_aggregator.user() assistant_aggregator = context_aggregator.assistant() From fef393dcacf4e1c94d7a819cf3ea35722fbc8a67 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 14:11:17 -0700 Subject: [PATCH 54/60] assistant aggregator switch for space padding or not --- src/pipecat/processors/aggregators/llm_response.py | 10 ++++++++-- src/pipecat/services/anthropic.py | 12 ++++++++---- src/pipecat/services/openai.py | 12 ++++++++---- src/pipecat/services/together.py | 12 ++++++++---- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index a3cd63cbd..4ea38b89f 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -39,6 +39,7 @@ def __init__( accumulator_frame: Type[TextFrame], interim_accumulator_frame: Type[TextFrame] | None = None, handle_interruptions: bool = False, + expect_stripped_words: bool = True, # if True, need to add spaces between words ): super().__init__() @@ -49,6 +50,7 @@ def __init__( self._accumulator_frame = accumulator_frame self._interim_accumulator_frame = interim_accumulator_frame self._handle_interruptions = handle_interruptions + self._expect_stripped_words = expect_stripped_words # Reset our accumulator state. self._reset() @@ -110,7 +112,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) elif isinstance(frame, self._accumulator_frame): if self._aggregating: - self._aggregation += frame.text if self._aggregation else frame.text + if self._expect_stripped_words: + self._aggregation += f" {frame.text}" if self._aggregation else frame.text + else: + self._aggregation += frame.text if self._aggregation else frame.text # We have recevied a complete sentence, so if we have seen the # end frame and we were still aggregating, it means we should # send the aggregation. 
@@ -289,7 +294,7 @@ async def _push_aggregation(self): class LLMAssistantContextAggregator(LLMContextAggregator): - def __init__(self, context: OpenAILLMContext): + def __init__(self, context: OpenAILLMContext, *, expect_stripped_words: bool = True): super().__init__( messages=[], context=context, @@ -298,6 +303,7 @@ def __init__(self, context: OpenAILLMContext): end_frame=LLMFullResponseEndFrame, accumulator_frame=TextFrame, handle_interruptions=True, + expect_stripped_words=expect_stripped_words, ) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 6a535ef15..86e1e3726 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -110,9 +110,13 @@ def enable_prompt_caching_beta(self) -> bool: return self._enable_prompt_caching_beta @staticmethod - def create_context_aggregator(context: OpenAILLMContext) -> AnthropicContextAggregatorPair: + def create_context_aggregator( + context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True + ) -> AnthropicContextAggregatorPair: user = AnthropicUserContextAggregator(context) - assistant = AnthropicAssistantContextAggregator(user) + assistant = AnthropicAssistantContextAggregator( + user, expect_stripped_words=assistant_expect_stripped_words + ) return AnthropicContextAggregatorPair(_user=user, _assistant=assistant) async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool): @@ -541,8 +545,8 @@ async def process_frame(self, frame, direction): class AnthropicAssistantContextAggregator(LLMAssistantContextAggregator): - def __init__(self, user_context_aggregator: AnthropicUserContextAggregator): - super().__init__(context=user_context_aggregator._context) + def __init__(self, user_context_aggregator: AnthropicUserContextAggregator, **kwargs): + super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator self._function_call_in_progress = None self._function_call_result = None diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 99d2d7497..c17916f2d 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -336,9 +336,13 @@ def __init__( super().__init__(model=model, params=params, **kwargs) @staticmethod - def create_context_aggregator(context: OpenAILLMContext) -> OpenAIContextAggregatorPair: + def create_context_aggregator( + context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True + ) -> OpenAIContextAggregatorPair: user = OpenAIUserContextAggregator(context) - assistant = OpenAIAssistantContextAggregator(user) + assistant = OpenAIAssistantContextAggregator( + user, expect_stripped_words=assistant_expect_stripped_words + ) return OpenAIContextAggregatorPair(_user=user, _assistant=assistant) @@ -458,8 +462,8 @@ def __init__(self, context: OpenAILLMContext): class OpenAIAssistantContextAggregator(LLMAssistantContextAggregator): - def __init__(self, user_context_aggregator: OpenAIUserContextAggregator): - super().__init__(context=user_context_aggregator._context) + def __init__(self, user_context_aggregator: OpenAIUserContextAggregator, **kwargs): + super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator self._function_call_in_progress = None self._function_call_result = None diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 935f625ad..3f4d97964 100644 --- a/src/pipecat/services/together.py +++ 
b/src/pipecat/services/together.py @@ -95,9 +95,13 @@ def can_generate_metrics(self) -> bool: return True @staticmethod - def create_context_aggregator(context: OpenAILLMContext) -> TogetherContextAggregatorPair: + def create_context_aggregator( + context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True + ) -> TogetherContextAggregatorPair: user = TogetherUserContextAggregator(context) - assistant = TogetherAssistantContextAggregator(user) + assistant = TogetherAssistantContextAggregator( + user, expect_stripped_words=assistant_expect_stripped_words + ) return TogetherContextAggregatorPair(_user=user, _assistant=assistant) async def set_frequency_penalty(self, frequency_penalty: float): @@ -331,8 +335,8 @@ async def process_frame(self, frame, direction): class TogetherAssistantContextAggregator(LLMAssistantContextAggregator): - def __init__(self, user_context_aggregator: TogetherUserContextAggregator): - super().__init__(context=user_context_aggregator._context) + def __init__(self, user_context_aggregator: TogetherUserContextAggregator, **kwargs): + super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator self._function_call_in_progress = None self._function_call_result = None From 539e0b66fb5abb446358c0b21ece7162fab63237 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 30 Sep 2024 09:26:06 -0700 Subject: [PATCH 55/60] small fix as per aleix --- src/pipecat/processors/aggregators/llm_response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 4ea38b89f..479746471 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -115,7 +115,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if self._expect_stripped_words: self._aggregation += f" {frame.text}" if self._aggregation else frame.text else: - self._aggregation += frame.text if self._aggregation else frame.text + self._aggregation += frame.text # We have recevied a complete sentence, so if we have seen the # end frame and we were still aggregating, it means we should # send the aggregation. From def04ac0ce0fe6f90c66e523066c3b4517dbc8d3 Mon Sep 17 00:00:00 2001 From: JeevanReddy Date: Wed, 7 Aug 2024 13:07:18 +0530 Subject: [PATCH 56/60] openai can give multiple tool calls, current implementation assumes only one function call at a time. Fixed this to handle multiple function calls. --- src/pipecat/services/openai.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index c17916f2d..b17dd7397 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -205,6 +205,10 @@ async def _stream_chat_completions( return chunks async def _process_context(self, context: OpenAILLMContext): + functions_list = [] + arguments_list = [] + tool_id_list = [] + func_idx = 0 function_name = "" arguments = "" tool_call_id = "" @@ -242,6 +246,14 @@ async def _process_context(self, context: OpenAILLMContext): # yield a frame containing the function name and the arguments. 
tool_call = chunk.choices[0].delta.tool_calls[0] + if tool_call.index != func_idx: + functions_list.append(function_name) + arguments_list.append(arguments) + tool_id_list.append(tool_call_id) + function_name = "" + arguments = "" + tool_call_id = "" + func_idx += 1 if tool_call.function and tool_call.function.name: function_name += tool_call.function.name tool_call_id = tool_call.id @@ -257,12 +269,21 @@ async def _process_context(self, context: OpenAILLMContext): # the context, and re-prompt to get a chat answer. If we don't have a registered # handler, raise an exception. if function_name and arguments: - if self.has_function(function_name): - await self._handle_function_call(context, tool_call_id, function_name, arguments) - else: - raise OpenAIUnhandledFunctionException( - f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." - ) + # added to the list as last function name and arguments not added to the list + functions_list.append(function_name) + arguments_list.append(arguments) + tool_id_list.append(tool_call_id) + for function_name, arguments, tool_id in zip( + functions_list, arguments_list, tool_id_list + ): + if self.has_function(function_name): + await self._handle_function_call(context, tool_id, function_name, arguments) + else: + raise OpenAIUnhandledFunctionException( + f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." + ) + # re-prompt to get a human answer after all the functions are called + await self._process_context(context) async def _handle_function_call(self, context, tool_call_id, function_name, arguments): arguments = json.loads(arguments) From a5c73ec829685f302b3fdb8450de2ac75297b72e Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 21:03:59 -0700 Subject: [PATCH 57/60] handle openai multiple function calls --- examples/foundational/14-function-calling.py | 11 ++-- src/pipecat/frames/frames.py | 1 + .../aggregators/openai_llm_context.py | 2 + src/pipecat/services/ai_services.py | 15 +++++- src/pipecat/services/openai.py | 54 +++++++++---------- 5 files changed, 50 insertions(+), 33 deletions(-) diff --git a/examples/foundational/14-function-calling.py b/examples/foundational/14-function-calling.py index b5aba449c..9141029ca 100644 --- a/examples/foundational/14-function-calling.py +++ b/examples/foundational/14-function-calling.py @@ -34,7 +34,12 @@ async def start_fetch_weather(function_name, llm, context): - await llm.push_frame(TextFrame("Let me check on that.")) + # note: we can't push a frame to the LLM here. the bot + # can interrupt itself and/or cause audio overlapping glitches. + # possible question for Aleix and Chad about what the right way + # to trigger speech is, now, with the new queues/async/sync refactors. 
+ # await llm.push_frame(TextFrame("Let me check on that.")) + logger.debug(f"Starting fetch_weather_from_api with function_name: {function_name}") async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): @@ -106,11 +111,11 @@ async def main(): pipeline = Pipeline( [ - fl_in, + # fl_in, transport.input(), context_aggregator.user(), llm, - fl_out, + # fl_out, tts, transport.output(), context_aggregator.assistant(), diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 8059b904b..f7faa8ef0 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -585,6 +585,7 @@ class FunctionCallResultFrame(DataFrame): tool_call_id: str arguments: str result: Any + run_llm: bool = True @dataclass diff --git a/src/pipecat/processors/aggregators/openai_llm_context.py b/src/pipecat/processors/aggregators/openai_llm_context.py index 83ec3e57f..4bf3f042c 100644 --- a/src/pipecat/processors/aggregators/openai_llm_context.py +++ b/src/pipecat/processors/aggregators/openai_llm_context.py @@ -133,6 +133,7 @@ async def call_function( tool_call_id: str, arguments: str, llm: FrameProcessor, + run_llm: bool = True, ) -> None: # Push a SystemFrame downstream. This frame will let our assistant context aggregator # know that we are in the middle of a function call. Some contexts/aggregators may @@ -153,6 +154,7 @@ async def function_call_result_callback(result): tool_call_id=tool_call_id, arguments=arguments, result=result, + run_llm=run_llm, ) ) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 5eadb475b..a46ad3fab 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -110,7 +110,13 @@ def has_function(self, function_name: str): return function_name in self._callbacks.keys() async def call_function( - self, *, context: OpenAILLMContext, tool_call_id: str, function_name: str, arguments: str + self, + *, + context: OpenAILLMContext, + tool_call_id: str, + function_name: str, + arguments: str, + run_llm: bool, ) -> None: f = None if function_name in self._callbacks.keys(): @@ -120,7 +126,12 @@ async def call_function( else: return None await context.call_function( - f, function_name=function_name, tool_call_id=tool_call_id, arguments=arguments, llm=self + f, + function_name=function_name, + tool_call_id=tool_call_id, + arguments=arguments, + llm=self, + run_llm=run_llm, ) # QUESTION FOR CB: maybe this isn't needed anymore? diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index b17dd7397..73dae4644 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -273,26 +273,21 @@ async def _process_context(self, context: OpenAILLMContext): functions_list.append(function_name) arguments_list.append(arguments) tool_id_list.append(tool_call_id) - for function_name, arguments, tool_id in zip( - functions_list, arguments_list, tool_id_list + + total_items = len(functions_list) + for index, (function_name, arguments, tool_id) in enumerate( + zip(functions_list, arguments_list, tool_id_list), start=1 ): if self.has_function(function_name): - await self._handle_function_call(context, tool_id, function_name, arguments) - else: - raise OpenAIUnhandledFunctionException( - f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." 
+ run_llm = index == total_items + arguments = json.loads(arguments) + await self.call_function( + context=context, + function_name=function_name, + arguments=arguments, + tool_call_id=tool_id, + run_llm=run_llm, ) - # re-prompt to get a human answer after all the functions are called - await self._process_context(context) - - async def _handle_function_call(self, context, tool_call_id, function_name, arguments): - arguments = json.loads(arguments) - await self.call_function( - context=context, - tool_call_id=tool_call_id, - function_name=function_name, - arguments=arguments, - ) async def _update_settings(self, frame: LLMUpdateSettingsFrame): if frame.model is not None: @@ -486,31 +481,34 @@ class OpenAIAssistantContextAggregator(LLMAssistantContextAggregator): def __init__(self, user_context_aggregator: OpenAIUserContextAggregator, **kwargs): super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator - self._function_call_in_progress = None + self._function_calls_in_progress = {} self._function_call_result = None async def process_frame(self, frame, direction): await super().process_frame(frame, direction) # See note above about not calling push_frame() here. if isinstance(frame, StartInterruptionFrame): - self._function_call_in_progress = None + self._function_calls_in_progress.clear() self._function_call_finished = None + logger.debug("clearing function calls in progress") elif isinstance(frame, FunctionCallInProgressFrame): - self._function_call_in_progress = frame + self._function_calls_in_progress[frame.tool_call_id] = frame + logger.debug( + f"FunctionCallInProgressFrame: {frame.tool_call_id} {self._function_calls_in_progress}" + ) elif isinstance(frame, FunctionCallResultFrame): - if ( - self._function_call_in_progress - and self._function_call_in_progress.tool_call_id == frame.tool_call_id - ): - self._function_call_in_progress = None + logger.debug( + f"FunctionCallResultFrame: {frame.tool_call_id} {self._function_calls_in_progress}" + ) + if frame.tool_call_id in self._function_calls_in_progress: + del self._function_calls_in_progress[frame.tool_call_id] self._function_call_result = frame # TODO-CB: Kwin wants us to refactor this out of here but I REFUSE await self._push_aggregation() else: logger.warning( - "FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + "FunctionCallResultFrame tool_call_id does not match any function call in progress" ) - self._function_call_in_progress = None self._function_call_result = None async def _push_aggregation(self): @@ -549,7 +547,7 @@ async def _push_aggregation(self): "tool_call_id": frame.tool_call_id, } ) - run_llm = True + run_llm = frame.run_llm else: self._context.add_message({"role": "assistant", "content": aggregation}) From 6ad3437fd2b3511b7aef91d4f9785b30ff0dc1ec Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 21:10:21 -0700 Subject: [PATCH 58/60] throw error if the llm tries to call a function that's not registered --- src/pipecat/services/openai.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 73dae4644..8a032ea40 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -288,6 +288,10 @@ async def _process_context(self, context: OpenAILLMContext): tool_call_id=tool_id, run_llm=run_llm, ) + else: + raise OpenAIUnhandledFunctionException( + f"The LLM tried to call a function named 
'{function_name}', but there isn't a callback registered for that function." + ) async def _update_settings(self, frame: LLMUpdateSettingsFrame): if frame.model is not None: From 0499fe41e455c700c2de11a1eebb84cc3f71a573 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 21:12:09 -0700 Subject: [PATCH 59/60] get rid of some debug log lines used during development --- src/pipecat/services/openai.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 8a032ea40..49fd04371 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -494,16 +494,9 @@ async def process_frame(self, frame, direction): if isinstance(frame, StartInterruptionFrame): self._function_calls_in_progress.clear() self._function_call_finished = None - logger.debug("clearing function calls in progress") elif isinstance(frame, FunctionCallInProgressFrame): self._function_calls_in_progress[frame.tool_call_id] = frame - logger.debug( - f"FunctionCallInProgressFrame: {frame.tool_call_id} {self._function_calls_in_progress}" - ) elif isinstance(frame, FunctionCallResultFrame): - logger.debug( - f"FunctionCallResultFrame: {frame.tool_call_id} {self._function_calls_in_progress}" - ) if frame.tool_call_id in self._function_calls_in_progress: del self._function_calls_in_progress[frame.tool_call_id] self._function_call_result = frame From 128355add5b80a7694e92472a24e438bbee2c3c5 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 30 Sep 2024 16:19:31 -0700 Subject: [PATCH 60/60] fix for multi-sentence tts say utterances --- src/pipecat/services/ai_services.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 5eadb475b..3e39b5443 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -249,7 +249,10 @@ async def cancel(self, frame: CancelFrame): self._stop_frame_task = None async def say(self, text: str): + aggregate_sentences = self._aggregate_sentences + self._aggregate_sentences = False await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM) + self._aggregate_sentences = aggregate_sentences await self.flush_audio() async def process_frame(self, frame: Frame, direction: FrameDirection):