From fa0deededa5c27cfbc190b207009626c3c00aabc Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Mon, 9 Sep 2024 10:53:23 +0900 Subject: [PATCH 01/60] Add voice options and make to use InputParams for Cartesia. --- .../07d-interruptible-cartesia.py | 4 +- .../12c-describe-video-anthropic.py | 4 +- examples/studypal/studypal.py | 4 +- src/pipecat/services/cartesia.py | 45 +++++++++++++------ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py index 6b8bbcc5f..7bcc7476b 100644 --- a/examples/foundational/07d-interruptible-cartesia.py +++ b/examples/foundational/07d-interruptible-cartesia.py @@ -52,7 +52,9 @@ async def main(): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man - sample_rate=44100, + params=CartesiaTTSService.InputParams( + sample_rate=44100, + ), ) llm = OpenAILLMService( diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py index cc1f14c92..8531debf8 100644 --- a/examples/foundational/12c-describe-video-anthropic.py +++ b/examples/foundational/12c-describe-video-anthropic.py @@ -78,7 +78,9 @@ async def main(): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady - sample_rate=16000, + params=CartesiaTTSService.InputParams( + sample_rate=16000, + ), ) @transport.event_handler("on_first_participant_joined") diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py index 8adfe2954..f14bd3def 100644 --- a/examples/studypal/studypal.py +++ b/examples/studypal/studypal.py @@ -124,7 +124,9 @@ async def main(): api_key=os.getenv("CARTESIA_API_KEY"), voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"), # British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9 - sample_rate=44100, + params=CartesiaTTSService.InputParams( + sample_rate=44100, + ), ) llm = OpenAILLMService( diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index e3541ccea..927da53f0 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -10,7 +10,8 @@ import asyncio import time -from typing import AsyncGenerator, Mapping +from typing import AsyncGenerator, Optional +from pydantic.main import BaseModel from pipecat.frames.frames import ( CancelFrame, @@ -61,6 +62,14 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(TTSService): + class InputParams(BaseModel): + model_id: Optional[str] = "sonic-english" + encoding: Optional[str] = "pcm_s16le" + sample_rate: Optional[int] = 16000 + container: Optional[str] = "raw" + language: Optional[str] = "en" + speed: Optional[str] = None + emotion: Optional[list[str]] = [] def __init__( self, @@ -69,10 +78,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", + params: InputParams = InputParams(), **kwargs): super().__init__(**kwargs) @@ -92,13 +98,15 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = model_id + self._model_id = params.model_id self._output_format = { - "container": "raw", - "encoding": encoding, - "sample_rate": sample_rate, + 
"container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, } - self._language = language + self._language = params.language + self._speed = params.speed + self._emotion = params.emotion self._websocket = None self._context_id = None @@ -249,15 +257,24 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() self._context_id = str(uuid.uuid4()) + voice_config = { + "mode": "id", + "id": self._voice_id + } + + if self._speed or self._emotion: + voice_config["__experimental_controls"] = {} + if self._speed: + voice_config["__experimental_controls"]["speed"] = self._speed + if self._emotion: + voice_config["__experimental_controls"]["emotion"] = self._emotion + msg = { "transcript": text + " ", "continue": True, "context_id": self._context_id, "model_id": self._model_id, - "voice": { - "mode": "id", - "id": self._voice_id - }, + "voice": voice_config, "output_format": self._output_format, "language": self._language, "add_timestamps": True, From 2da0ecbe3c7326d180125d16f32ee8f95113f847 Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Wed, 18 Sep 2024 00:38:12 +0900 Subject: [PATCH 02/60] Revert "model_id" as a main argument --- src/pipecat/services/cartesia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 25f54bf11..45e42470e 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -63,7 +63,6 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(AsyncWordTTSService): class InputParams(BaseModel): - model_id: Optional[str] = "sonic-english" encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" @@ -78,6 +77,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", params: InputParams = InputParams(), **kwargs): # Aggregating sentences still gives cleaner-sounding results and fewer @@ -96,7 +96,7 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = params.model_id + self._model_id = model_id self._output_format = { "container": params.container, "encoding": params.encoding, From 75008d8f115ccab562a9b564e7415bdcb024a850 Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Wed, 18 Sep 2024 00:51:45 +0900 Subject: [PATCH 03/60] Add speed and emotion setting method to Cartesia TTS service --- src/pipecat/services/cartesia.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 45e42470e..6cec9c06d 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -121,6 +121,14 @@ async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice_id = voice + async def set_speed(self, speed: str): + logger.debug(f"Switching TTS speed to: [{speed}]") + self._speed = speed + + async def set_emotion(self, emotion: list[str]): + logger.debug(f"Switching TTS emotion to: [{emotion}]") + self._emotion = emotion + async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) From 4533ed014fc4a22d3f7123c8f2ab3425ff7afb2f Mon Sep 17 00:00:00 2001 From: duyalei <> Date: Mon, 23 Sep 2024 16:34:31 +0800 Subject: [PATCH 04/60] 
add full-width punctuations as end of the sentence --- src/pipecat/utils/string.py | 3 ++- tests/test_ai_services.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py index a47db6c5c..cf9a22ad8 100644 --- a/src/pipecat/utils/string.py +++ b/src/pipecat/utils/string.py @@ -14,7 +14,8 @@ (? Date: Mon, 23 Sep 2024 08:37:29 -0400 Subject: [PATCH 05/60] Add language_code support for ElevenLabs TTS --- src/pipecat/services/elevenlabs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 00a32cbfd..79d90bc58 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -72,6 +72,7 @@ def calculate_word_times( class ElevenLabsTTSService(AsyncWordTTSService): class InputParams(BaseModel): + language_code: Optional[str] = None output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" optimize_streaming_latency: Optional[str] = None stability: Optional[float] = None @@ -228,6 +229,15 @@ async def _connect(self): if self._params.optimize_streaming_latency: url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}" + # language_code can only be used with the 'eleven_turbo_v2_5' model + if self._params.language_code: + if model == "eleven_turbo_v2_5": + url += f"&language_code={self._params.language_code}" + else: + logger.debug( + f"Language code [{self._params.language_code}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model." + ) + self._websocket = await websockets.connect(url) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) self._keepalive_task = self.get_event_loop().create_task(self._keepalive_task_handler()) From 8edee8155dbf7d7a47ee491a3d1caafdcd3910d5 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 23 Sep 2024 10:07:05 -0400 Subject: [PATCH 06/60] Add input params to Azure TTS --- src/pipecat/services/azure.py | 115 ++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 39 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 24e73cd2a..efb6a6fe4 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,45 +4,34 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import io +from typing import AsyncGenerator, Optional +import aiohttp +from loguru import logger from PIL import Image -from typing import AsyncGenerator - -from pipecat.frames.frames import ( - CancelFrame, - EndFrame, - ErrorFrame, - Frame, - StartFrame, - TTSAudioRawFrame, - TTSStartedFrame, - TTSStoppedFrame, - TranscriptionFrame, - URLImageRawFrame, -) -from pipecat.metrics.metrics import TTSUsageMetricsData -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import STTService, TTSService, ImageGenService +from pydantic import BaseModel + +from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame, + StartFrame, TranscriptionFrame, + TTSAudioRawFrame, TTSStartedFrame, + TTSStoppedFrame, URLImageRawFrame) +from pipecat.services.ai_services import (ImageGenService, STTService, + TTSService) from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 -from loguru import logger - # See .env.example for Azure configuration needed try: - from openai import AsyncAzureOpenAI - from 
azure.cognitiveservices.speech import ( - SpeechConfig, - SpeechRecognizer, - SpeechSynthesizer, - ResultReason, - CancellationReason, - ) - from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream + from azure.cognitiveservices.speech import (CancellationReason, + ResultReason, SpeechConfig, + SpeechRecognizer, + SpeechSynthesizer) + from azure.cognitiveservices.speech.audio import (AudioStreamFormat, + PushAudioInputStream) from azure.cognitiveservices.speech.dialog import AudioConfig + from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -70,6 +59,17 @@ def create_client(self, api_key=None, base_url=None, **kwargs): class AzureTTSService(TTSService): + class InputParams(BaseModel): + emphasis: Optional[str] = None + language_code: Optional[str] = "en-US" + pitch: Optional[str] = None + rate: Optional[str] = "1.05" + role: Optional[str] = None + style: Optional[str] = None + style_degree: Optional[str] = None + volume: Optional[str] = None + + def __init__( self, *, @@ -77,6 +77,7 @@ def __init__( region: str, voice="en-US-SaraNeural", sample_rate: int = 16000, + params: InputParams = InputParams(), **kwargs, ): super().__init__(sample_rate=sample_rate, **kwargs) @@ -86,10 +87,55 @@ def __init__( self._voice = voice self._sample_rate = sample_rate + self._params = params def can_generate_metrics(self) -> bool: return True + def _construct_ssml(self, text: str) -> str: + ssml = ( + f"" + f"" + "" + ) + + if self._params.style: + ssml += f"" + + if self._params.emphasis: + ssml += f"" + + ssml += text + + if self._params.emphasis: + ssml += "" + + ssml += "" + + if self._params.style: + ssml += "" + + ssml += "" + + return ssml + async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice = voice @@ -99,16 +145,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() - ssml = ( - "" - f"" - "" - "" - "" - f"{text}" - " " - ) + ssml = self._construct_ssml(text) result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml)) From 49f212389305465791eab139083b2e3286a05dee Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Tue, 24 Sep 2024 07:59:26 +0900 Subject: [PATCH 07/60] Apply and Fix upstream changes for Cartesia --- src/pipecat/services/cartesia.py | 113 +++++++++++++++++++------------ 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 40475343c..f08c06dea 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -9,7 +9,7 @@ import base64 import asyncio -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, Optional, Union, List from pydantic.main import BaseModel from pipecat.frames.frames import ( @@ -67,8 +67,8 @@ class InputParams(BaseModel): sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" language: Optional[str] = "en" - speed: Optional[str] = None - emotion: Optional[list[str]] = [] + speed: Optional[Union[str, float]] = "" + emotion: Optional[List[str]] = [] def __init__( self, @@ -91,13 +91,14 @@ def __init__( # can use those to generate text frames ourselves aligned with the # playout timing of the audio! 
super().__init__( - aggregate_sentences=True, push_text_frames=False, sample_rate=sample_rate, **kwargs + aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs ) self._api_key = api_key self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id + self._model_id = model_id self.set_model_name(model_id) self._output_format = { "container": params.container, @@ -116,6 +117,7 @@ def can_generate_metrics(self) -> bool: return True async def set_model(self, model: str): + self._model_id = model await super().set_model(model) logger.debug(f"Switching TTS model to: [{model}]") @@ -135,6 +137,31 @@ async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) + def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True): + voice_config = { + "mode": "id", + "id": self._voice_id + } + + if self._speed or self._emotion: + voice_config["__experimental_controls"] = {} + if self._speed: + voice_config["__experimental_controls"]["speed"] = self._speed + if self._emotion: + voice_config["__experimental_controls"]["emotion"] = self._emotion + + msg = { + "transcript": text, + "continue": continue_transcript, + "context_id": self._context_id, + "model_id": self._model_name, + "voice": voice_config, + "output_format": self._output_format, + "language": self._language, + "add_timestamps": add_timestamps, + } + return json.dumps(msg) + async def start(self, frame: StartFrame): await super().start(frame) await self._connect() @@ -190,17 +217,8 @@ async def flush_audio(self): if not self._context_id or not self._websocket: return logger.trace("Flushing audio") - msg = { - "transcript": "", - "continue": False, - "context_id": self._context_id, - "model_id": self.model_name, - "voice": {"mode": "id", "id": self._voice_id}, - "output_format": self._output_format, - "language": self._language, - "add_timestamps": True, - } - await self._websocket.send(json.dumps(msg)) + msg = self._build_msg(text="", continue_transcript=False) + await self._websocket.send(msg) async def _receive_task_handler(self): try: @@ -255,30 +273,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() self._context_id = str(uuid.uuid4()) - voice_config = { - "mode": "id", - "id": self._voice_id - } + msg = self._build_msg(text=text) - if self._speed or self._emotion: - voice_config["__experimental_controls"] = {} - if self._speed: - voice_config["__experimental_controls"]["speed"] = self._speed - if self._emotion: - voice_config["__experimental_controls"]["emotion"] = self._emotion - - msg = { - "transcript": text + " ", - "continue": True, - "context_id": self._context_id, - "model_id": self._model_id, - "voice": voice_config, - "output_format": self._output_format, - "language": self._language, - "add_timestamps": True, - } try: - await self._get_websocket().send(json.dumps(msg)) + await self._get_websocket().send(msg) await self.start_tts_usage_metrics(text) except Exception as e: logger.error(f"{self} error sending message: {e}") @@ -292,6 +290,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: class CartesiaHttpTTSService(TTSService): + class InputParams(BaseModel): + encoding: Optional[str] = "pcm_s16le" + sample_rate: Optional[int] = 16000 + container: Optional[str] = "raw" + language: Optional[str] = "en" + speed: Optional[Union[str, float]] = "" + emotion: 
Optional[List[str]] = [] + def __init__( self, *, @@ -299,9 +305,7 @@ def __init__( voice_id: str, model_id: str = "sonic-english", base_url: str = "https://api.cartesia.ai", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", + params: InputParams = InputParams(), **kwargs, ): super().__init__(**kwargs) @@ -309,12 +313,15 @@ def __init__( self._api_key = api_key self._voice_id = voice_id self._model_id = model_id + self.set_model_name(model_id) self._output_format = { - "container": "raw", - "encoding": encoding, - "sample_rate": sample_rate, + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, } - self._language = language + self._language = params.language + self._speed = params.speed + self._emotion = params.emotion self._client = AsyncCartesia(api_key=api_key, base_url=base_url) @@ -324,11 +331,20 @@ def can_generate_metrics(self) -> bool: async def set_model(self, model: str): logger.debug(f"Switching TTS model to: [{model}]") self._model_id = model + await super().set_model(model) async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice_id = voice + async def set_speed(self, speed: str): + logger.debug(f"Switching TTS speed to: [{speed}]") + self._speed = speed + + async def set_emotion(self, emotion: list[str]): + logger.debug(f"Switching TTS emotion to: [{emotion}]") + self._emotion = emotion + async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) @@ -348,6 +364,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() try: + voice_controls = None + if self._speed or self._emotion: + voice_controls = {} + if self._speed: + voice_controls["speed"] = self._speed + if self._emotion: + voice_controls["emotion"] = self._emotion + output = await self._client.tts.sse( model_id=self._model_id, transcript=text, @@ -355,6 +379,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: output_format=self._output_format, language=self._language, stream=False, + _experimental_voice_controls=voice_controls ) await self.stop_ttfb_metrics() From 8ee9621d6614e8f5c5d9e0252411f5b9c3569149 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 23 Sep 2024 10:49:50 -0400 Subject: [PATCH 08/60] Add setter functions --- src/pipecat/services/azure.py | 89 +++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index efb6a6fe4..41fc7598b 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -13,23 +13,32 @@ from PIL import Image from pydantic import BaseModel -from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame, - StartFrame, TranscriptionFrame, - TTSAudioRawFrame, TTSStartedFrame, - TTSStoppedFrame, URLImageRawFrame) -from pipecat.services.ai_services import (ImageGenService, STTService, - TTSService) +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + StartFrame, + TranscriptionFrame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, + URLImageRawFrame, +) +from pipecat.services.ai_services import ImageGenService, STTService, TTSService from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 # See .env.example for Azure configuration needed try: - from 
azure.cognitiveservices.speech import (CancellationReason, - ResultReason, SpeechConfig, - SpeechRecognizer, - SpeechSynthesizer) - from azure.cognitiveservices.speech.audio import (AudioStreamFormat, - PushAudioInputStream) + from azure.cognitiveservices.speech import ( + CancellationReason, + ResultReason, + SpeechConfig, + SpeechRecognizer, + SpeechSynthesizer, + ) + from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream from azure.cognitiveservices.speech.dialog import AudioConfig from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: @@ -69,7 +78,6 @@ class InputParams(BaseModel): style_degree: Optional[str] = None volume: Optional[str] = None - def __init__( self, *, @@ -116,7 +124,7 @@ def _construct_ssml(self, text: str) -> str: prosody_attrs.append(f"pitch='{self._params.pitch}'") if self._params.volume: prosody_attrs.append(f"volume='{self._params.volume}'") - + ssml += f"" if self._params.emphasis: @@ -140,6 +148,59 @@ async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice = voice + async def set_emphasis(self, emphasis: str): + logger.debug(f"Setting TTS emphasis to: [{emphasis}]") + self._params.emphasis = emphasis + + async def set_language_code(self, language_code: str): + logger.debug(f"Setting TTS language code to: [{language_code}]") + self._params.language_code = language_code + + async def set_pitch(self, pitch: str): + logger.debug(f"Setting TTS pitch to: [{pitch}]") + self._params.pitch = pitch + + async def set_rate(self, rate: str): + logger.debug(f"Setting TTS rate to: [{rate}]") + self._params.rate = rate + + async def set_role(self, role: str): + logger.debug(f"Setting TTS role to: [{role}]") + self._params.role = role + + async def set_style(self, style: str): + logger.debug(f"Setting TTS style to: [{style}]") + self._params.style = style + + async def set_style_degree(self, style_degree: str): + logger.debug(f"Setting TTS style degree to: [{style_degree}]") + self._params.style_degree = style_degree + + async def set_volume(self, volume: str): + logger.debug(f"Setting TTS volume to: [{volume}]") + self._params.volume = volume + + async def set_params(self, **kwargs): + valid_params = { + "voice": self.set_voice, + "emphasis": self.set_emphasis, + "language_code": self.set_language_code, + "pitch": self.set_pitch, + "rate": self.set_rate, + "role": self.set_role, + "style": self.set_style, + "style_degree": self.set_style_degree, + "volume": self.set_volume, + } + + for param, value in kwargs.items(): + if param in valid_params: + await valid_params[param](value) + else: + logger.warning(f"Ignoring unknown parameter: {param}") + + logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}") + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") From b1e17ee34792326350c4040216bff69a50f3c4b8 Mon Sep 17 00:00:00 2001 From: mercuryyy Date: Tue, 24 Sep 2024 07:45:29 -0400 Subject: [PATCH 09/60] Fix syntax error in deepgram.py --- src/pipecat/services/deepgram.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index fab12e080..914bc2ec2 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -77,8 +77,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") base_url = self._base_url - request_url = 
f"{base_url}?model={self._voice}&encoding={ - self._encoding}&container=none&sample_rate={self._sample_rate}" + request_url = f"{base_url}?model={self._voice}&encoding={self._encoding}&container=none&sample_rate={self._sample_rate}" headers = {"authorization": f"token {self._api_key}"} body = {"text": text} From cb49b6a0d678662cdc2483b2497e00fbf0af28bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 23 Sep 2024 23:39:05 -0700 Subject: [PATCH 10/60] rtvi: add llm-text and tts-text server messages --- src/pipecat/processors/frameworks/rtvi.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 820ea716c..7a35c5c06 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -242,6 +242,22 @@ class RTVILLMFunctionCallResultData(BaseModel): result: dict | str +class RTVITextMessageData(BaseModel): + text: str + + +class RTVILLMTextMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["llm-text"] = "llm-text" + data: RTVITextMessageData + + +class RTVITTSTextMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["tts-text"] = "tts-text" + data: RTVITextMessageData + + class RTVITranscriptionMessageData(BaseModel): text: str user_id: str From 08ac311971afb82fff28efa58af81814d9c052c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 09:35:36 -0700 Subject: [PATCH 11/60] rtvi: use task to process incoming action frames --- src/pipecat/processors/frameworks/rtvi.py | 24 ++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 7a35c5c06..bb23c9856 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -316,6 +316,11 @@ def __init__( self._registered_actions: Dict[str, RTVIAction] = {} self._registered_services: Dict[str, RTVIService] = {} + # A task to process incoming action frames. + self._action_task = self.get_event_loop().create_task(self._action_task_handler()) + self._action_queue = asyncio.Queue() + + # A task to process incoming transport messages. self._message_task = self.get_event_loop().create_task(self._message_task_handler()) self._message_queue = asyncio.Queue() @@ -401,7 +406,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, TransportMessageFrame): await self._message_queue.put(frame) elif isinstance(frame, RTVIActionFrame): - await self._handle_action(frame.message_id, frame.rtvi_action_run) + await self._action_queue.put(frame) # Other frames else: await self.push_frame(frame, direction) @@ -415,12 +420,16 @@ async def _start(self, frame: StartFrame): await self._maybe_send_bot_ready() async def _stop(self, frame: EndFrame): - # We need to cancel the message task handler because that one is not - # processing EndFrames. 
+ self._action_task.cancel() + await self._action_task + self._message_task.cancel() await self._message_task async def _cancel(self, frame: CancelFrame): + self._action_task.cancel() + await self._action_task + self._message_task.cancel() await self._message_task @@ -471,6 +480,15 @@ async def _handle_bot_speaking(self, frame: Frame): if message: await self._push_transport_message(message) + async def _action_task_handler(self): + while True: + try: + frame = await self._action_queue.get() + await self._handle_action(frame.message_id, frame.rtvi_action_run) + self._action_queue.task_done() + except asyncio.CancelledError: + break + async def _message_task_handler(self): while True: try: From a483f1a083d7d97e932c680fa542c78926dba1a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 10:48:15 -0700 Subject: [PATCH 12/60] rtvi: handle all actions from the action task --- src/pipecat/processors/frameworks/rtvi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index bb23c9856..49095b2e4 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -521,7 +521,8 @@ async def _handle_message(self, frame: TransportMessageFrame): await self._handle_update_config(message.id, update_config) case "action": action = RTVIActionRun.model_validate(message.data) - await self._handle_action(message.id, action) + action_frame = RTVIActionFrame(message_id=message.id, rtvi_action_run=action) + await self._action_queue.put(action_frame) case "llm-function-call-result": data = RTVILLMFunctionCallResultData.model_validate(message.data) await self._handle_function_call_result(data) From 31b5667cee272651478aebc13e5ca4f5ab1bd2c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 13:10:40 -0700 Subject: [PATCH 13/60] frames: log text with [] so we can distinguish spaces better --- src/pipecat/frames/frames.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index e4495098b..768e53e39 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -132,9 +132,7 @@ class VisionImageRawFrame(InputImageRawFrame): def __str__(self): pts = format_pts(self.pts) - return ( - f"{self.name}(pts: {pts}, text: {self.text}, size: {self.size}, format: {self.format})" - ) + return f"{self.name}(pts: {pts}, text: [{self.text}], size: {self.size}, format: {self.format})" @dataclass @@ -177,7 +175,7 @@ class TextFrame(DataFrame): def __str__(self): pts = format_pts(self.pts) - return f"{self.name}(pts: {pts}, text: {self.text})" + return f"{self.name}(pts: {pts}, text: [{self.text}])" @dataclass @@ -192,7 +190,7 @@ class TranscriptionFrame(TextFrame): language: Language | None = None def __str__(self): - return f"{self.name}(user: {self.user_id}, text: {self.text}, language: {self.language}, timestamp: {self.timestamp})" + return f"{self.name}(user: {self.user_id}, text: [{self.text}], language: {self.language}, timestamp: {self.timestamp})" @dataclass @@ -205,7 +203,7 @@ class InterimTranscriptionFrame(TextFrame): language: Language | None = None def __str__(self): - return f"{self.name}(user: {self.user_id}, text: {self.text}, language: {self.language}, timestamp: {self.timestamp})" + return f"{self.name}(user: {self.user_id}, text: [{self.text}], language: {self.language}, timestamp: 
{self.timestamp})" @dataclass From ee3786fe155a426843707d465b302f3419d569f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:10:22 -0700 Subject: [PATCH 14/60] frames: add EndTaskFrame and CancelTaskFrame --- CHANGELOG.md | 4 ++++ src/pipecat/frames/frames.py | 21 +++++++++++++++++++++ src/pipecat/pipeline/task.py | 16 ++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1d571da5..fb08aab2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `EndTaskFrame` and `CancelTaskFrame`. These are new frames that are + meant to be pushed upstream to tell the pipeline task to stop nicely or + immediately respectively. + - Added configurable LLM parameters (e.g., temperature, top_p, max_tokens, seed) for OpenAI, Anthropic, and Together AI services along with corresponding setter functions. diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 768e53e39..273aad214 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -339,6 +339,27 @@ class FatalErrorFrame(ErrorFrame): fatal: bool = field(default=True, init=False) +@dataclass +class EndTaskFrame(SystemFrame): + """This is used to notify the pipeline task that the pipeline should be + closed nicely (flushing all the queued frames) by pushing an EndFrame + downstream. + + """ + + pass + + +@dataclass +class CancelTaskFrame(SystemFrame): + """This is used to notify the pipeline task that the pipeline should be + stopped immediately by pushing a CancelFrame downstream. + + """ + + pass + + @dataclass class StopTaskFrame(SystemFrame): """Indicates that a pipeline task should be stopped but that the pipeline diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index 2b46c47c2..f79ff6f39 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -14,7 +14,9 @@ from pipecat.clocks.system_clock import SystemClock from pipecat.frames.frames import ( CancelFrame, + CancelTaskFrame, EndFrame, + EndTaskFrame, ErrorFrame, Frame, MetricsFrame, @@ -52,7 +54,13 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) async def _handle_upstream_frame(self, frame: Frame): - if isinstance(frame, ErrorFrame): + if isinstance(frame, EndTaskFrame): + # Tell the task we should end nicely. + await self._up_queue.put(EndTaskFrame()) + elif isinstance(frame, CancelTaskFrame): + # Tell the task we should end right away. + await self._up_queue.put(CancelTaskFrame()) + elif isinstance(frame, ErrorFrame): logger.error(f"Error running app: {frame}") if frame.fatal: # Cancel all tasks downstream. 
@@ -165,7 +173,11 @@ async def _process_up_queue(self): while True: try: frame = await self._up_queue.get() - if isinstance(frame, StopTaskFrame): + if isinstance(frame, EndTaskFrame): + await self.queue_frame(EndFrame()) + elif isinstance(frame, CancelTaskFrame): + await self.queue_frame(CancelFrame()) + elif isinstance(frame, StopTaskFrame): await self.queue_frame(StopTaskFrame()) self._up_queue.task_done() except asyncio.CancelledError: From 1a3de0e8191a7529a474ec17f49b906d78c82623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:12:06 -0700 Subject: [PATCH 15/60] rtvi: add RTVIProcessor.handle_message() --- src/pipecat/processors/frameworks/rtvi.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 49095b2e4..b75b65627 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -343,6 +343,9 @@ async def set_client_ready(self): self._client_ready = True await self._maybe_send_bot_ready() + async def handle_message(self, message: RTVIMessage): + await self._message_queue.put(message) + async def handle_function_call( self, function_name: str, @@ -492,20 +495,21 @@ async def _action_task_handler(self): async def _message_task_handler(self): while True: try: - frame = await self._message_queue.get() - await self._handle_message(frame) + message = await self._message_queue.get() + await self._handle_message(message) self._message_queue.task_done() except asyncio.CancelledError: break - async def _handle_message(self, frame: TransportMessageFrame): + async def _handle_transport_message(self, frame: TransportMessageFrame): try: message = RTVIMessage.model_validate(frame.message) + await self._message_queue.put(message) except ValidationError as e: - await self.send_error(f"Invalid incoming message: {e}") - logger.warning(f"Invalid incoming message: {e}") - return + await self.send_error(f"Invalid RTVI transport message: {e}") + logger.warning(f"Invalid RTVI transport message: {e}") + async def _handle_message(self, message: RTVIMessage): try: match message.type: case "client-ready": @@ -531,8 +535,8 @@ async def _handle_message(self, frame: TransportMessageFrame): await self._send_error_response(message.id, f"Unsupported type {message.type}") except ValidationError as e: - await self._send_error_response(message.id, f"Invalid incoming message: {e}") - logger.warning(f"Invalid incoming message: {e}") + await self._send_error_response(message.id, f"Invalid message: {e}") + logger.warning(f"Invalid message: {e}") except Exception as e: await self._send_error_response(message.id, f"Exception processing message: {e}") logger.warning(f"Exception processing message: {e}") From e276dcbab78907dac1df1e5ab1176da74255b4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:19:00 -0700 Subject: [PATCH 16/60] initialize task variables and add minor description --- src/pipecat/processors/frame_processor.py | 1 - src/pipecat/processors/frameworks/rtvi.py | 26 ++++++---- src/pipecat/services/ai_services.py | 1 + src/pipecat/transports/base_input.py | 13 +++-- src/pipecat/transports/base_output.py | 59 +++++++++++++++++------ src/pipecat/transports/services/daily.py | 5 ++ 6 files changed, 77 insertions(+), 28 deletions(-) diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index 
1bf42311d..b56846aa6 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -5,7 +5,6 @@ # import asyncio -import time from enum import Enum diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 49095b2e4..994be3fad 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -420,18 +420,26 @@ async def _start(self, frame: StartFrame): await self._maybe_send_bot_ready() async def _stop(self, frame: EndFrame): - self._action_task.cancel() - await self._action_task + if self._action_task: + self._action_task.cancel() + await self._action_task + self._action_task = None - self._message_task.cancel() - await self._message_task + if self._message_task: + self._message_task.cancel() + await self._message_task + self._message_task = None async def _cancel(self, frame: CancelFrame): - self._action_task.cancel() - await self._action_task - - self._message_task.cancel() - await self._message_task + if self._action_task: + self._action_task.cancel() + await self._action_task + self._action_task = None + + if self._message_task: + self._message_task.cancel() + await self._message_task + self._message_task = None async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True): frame = TransportMessageFrame( diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index cdad3de52..197067adc 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -350,6 +350,7 @@ async def _stop_words_task(self): if self._words_task: self._words_task.cancel() await self._words_task + self._words_task = None async def _words_task_handler(self): while True: diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index 73ad3f5e3..df7babff1 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -37,6 +37,10 @@ def __init__(self, params: TransportParams, **kwargs): self._executor = ThreadPoolExecutor(max_workers=5) + # Task to process incoming audio (VAD) and push audio frames downstream + # if passthrough is enabled. + self._audio_task = None + async def start(self, frame: StartFrame): # Create audio input queue and task if needed. if self._params.audio_in_enabled or self._params.vad_enabled: @@ -45,16 +49,17 @@ async def start(self, frame: StartFrame): async def stop(self, frame: EndFrame): # Cancel and wait for the audio input task to finish. - if self._params.audio_in_enabled or self._params.vad_enabled: + if self._audio_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_task.cancel() await self._audio_task + self._audio_task = None async def cancel(self, frame: CancelFrame): - # Cancel all the tasks and wait for them to finish. - - if self._params.audio_in_enabled or self._params.vad_enabled: + # Cancel and wait for the audio input task to finish. 
+ if self._audio_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_task.cancel() await self._audio_task + self._audio_task = None def vad_analyzer(self) -> VADAnalyzer | None: return self._params.vad_analyzer diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 5423b122f..941a3505a 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -47,6 +47,18 @@ def __init__(self, params: TransportParams, **kwargs): self._params = params + # Task to process incoming frames so we don't block upstream elements. + self._sink_task = None + + # Task to process incoming frames using a clock. + self._sink_clock_task = None + + # Task to write/send audio frames. + self._audio_out_task = None + + # Task to write/send image frames. + self._camera_out_task = None + # These are the images that we should send to the camera at our desired # framerate. self._camera_images = None @@ -88,36 +100,53 @@ async def stop(self, frame: EndFrame): # that EndFrame to be processed by the sink tasks. We also need to wait # for these tasks before cancelling the camera and audio tasks below # because they might be still rendering. - await self._sink_task - await self._sink_clock_task + if self._sink_task: + await self._sink_task + if self._sink_clock_task: + await self._sink_clock_task # Cancel and wait for the camera output task to finish. - if self._params.camera_out_enabled: + if self._camera_out_task and self._params.camera_out_enabled: self._camera_out_task.cancel() await self._camera_out_task + self._camera_out_task = None # Cancel and wait for the audio output task to finish. - if self._params.audio_out_enabled and self._params.audio_out_is_live: + if ( + self._audio_out_task + and self._params.audio_out_enabled + and self._params.audio_out_is_live + ): self._audio_out_task.cancel() await self._audio_out_task + self._audio_out_task = None async def cancel(self, frame: CancelFrame): # Since we are cancelling everything it doesn't matter if we cancel sink # tasks first or not. - self._sink_task.cancel() - self._sink_clock_task.cancel() - await self._sink_task - await self._sink_clock_task + if self._sink_task: + self._sink_task.cancel() + await self._sink_task + self._sink_task = None + + if self._sink_clock_task: + self._sink_clock_task.cancel() + await self._sink_clock_task + self._sink_clock_task = None # Cancel and wait for the camera output task to finish. - if self._params.camera_out_enabled: + if self._camera_out_task and self._params.camera_out_enabled: self._camera_out_task.cancel() await self._camera_out_task + self._camera_out_task = None # Cancel and wait for the audio output task to finish. - if self._params.audio_out_enabled and self._params.audio_out_is_live: + if self._audio_out_task and ( + self._params.audio_out_enabled and self._params.audio_out_is_live + ): self._audio_out_task.cancel() await self._audio_out_task + self._audio_out_task = None async def send_message(self, frame: TransportMessageFrame): pass @@ -183,11 +212,13 @@ async def _handle_interruptions(self, frame: Frame): if isinstance(frame, StartInterruptionFrame): # Stop sink tasks. - self._sink_task.cancel() - await self._sink_task + if self._sink_task: + self._sink_task.cancel() + await self._sink_task # Stop sink clock tasks. - self._sink_clock_task.cancel() - await self._sink_clock_task + if self._sink_clock_task: + self._sink_clock_task.cancel() + await self._sink_clock_task # Create sink tasks. 
self._create_sink_tasks() # Let's send a bot stopped speaking if we have to. diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 48b59d8ff..50c2ae085 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -575,6 +575,9 @@ def __init__(self, client: DailyTransportClient, params: DailyParams, **kwargs): self._client = client self._video_renderers = {} + + # Task that gets audio data from a device or the network and queues it + # internally to be processed. self._audio_in_task = None self._vad_analyzer: VADAnalyzer | None = params.vad_analyzer @@ -603,6 +606,7 @@ async def stop(self, frame: EndFrame): if self._audio_in_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_in_task.cancel() await self._audio_in_task + self._audio_in_task = None async def cancel(self, frame: CancelFrame): # Parent stop. @@ -613,6 +617,7 @@ async def cancel(self, frame: CancelFrame): if self._audio_in_task and (self._params.audio_in_enabled or self._params.vad_enabled): self._audio_in_task.cancel() await self._audio_in_task + self._audio_in_task = None async def cleanup(self): await super().cleanup() From 9461bacf0d6793661b20d7cc5367ad083188a433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:24:37 -0700 Subject: [PATCH 17/60] pyproject: update fastapi to 0.115.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index aebccda2f..46345ed71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ openpipe = [ "openpipe~=4.24.0" ] playht = [ "pyht~=0.0.28" ] silero = [ "onnxruntime>=1.16.1" ] together = [ "together~=1.2.7" ] -websocket = [ "websockets~=12.0", "fastapi~=0.112.1" ] +websocket = [ "websockets~=12.0", "fastapi~=0.115.0" ] whisper = [ "faster-whisper~=1.0.3" ] xtts = [ "resampy~=0.4.3" ] From b8713666c23d19989abdfd6512838e0cefd2df0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 24 Sep 2024 19:35:05 -0700 Subject: [PATCH 18/60] processors: add AsyncGeneratorProcessor --- CHANGELOG.md | 4 +++ src/pipecat/processors/async_generator.py | 42 +++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 src/pipecat/processors/async_generator.py diff --git a/CHANGELOG.md b/CHANGELOG.md index fb08aab2e..6c4bf3c92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `AsyncGeneratorProcessor`. This processor can be used together with a + `FrameSerializer` as an async generator. It provides a `generator()` function + that returns an `AsyncGenerator` and that yields serialized frames. + - Added `EndTaskFrame` and `CancelTaskFrame`. These are new frames that are meant to be pushed upstream to tell the pipeline task to stop nicely or immediately respectively. 
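For context, the `AsyncGeneratorProcessor` introduced below is meant to be consumed roughly as follows. This is a minimal usage sketch, not part of the patch: the `ProtobufFrameSerializer` choice and the surrounding streaming coroutine are illustrative assumptions, and the pipeline wiring is elided.

```
# Hypothetical usage sketch (not part of this patch): consume the serialized
# frames produced by an AsyncGeneratorProcessor, e.g. to stream them out over
# a network response. ProtobufFrameSerializer is one FrameSerializer option;
# any serializer whose serialize() returns data would work the same way.
from pipecat.processors.async_generator import AsyncGeneratorProcessor
from pipecat.serializers.protobuf import ProtobufFrameSerializer

processor = AsyncGeneratorProcessor(serializer=ProtobufFrameSerializer())
# ... add `processor` to a Pipeline so frames flow through it ...

async def stream_frames():
    # generator() yields each serialized frame and finishes after the
    # processor sees an EndFrame or CancelFrame.
    async for data in processor.generator():
        yield data
```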
diff --git a/src/pipecat/processors/async_generator.py b/src/pipecat/processors/async_generator.py new file mode 100644 index 000000000..66b2a3e99 --- /dev/null +++ b/src/pipecat/processors/async_generator.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio + +from typing import Any, AsyncGenerator + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, +) +from pipecat.processors.frame_processor import FrameProcessor, FrameDirection +from pipecat.serializers.base_serializer import FrameSerializer + + +class AsyncGeneratorProcessor(FrameProcessor): + def __init__(self, *, serializer: FrameSerializer, **kwargs): + super().__init__(**kwargs) + self._serializer = serializer + self._data_queue = asyncio.Queue() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, (CancelFrame, EndFrame)): + await self._data_queue.put(None) + else: + data = self._serializer.serialize(frame) + if data: + await self._data_queue.put(data) + + async def generator(self) -> AsyncGenerator[Any, None]: + running = True + while running: + data = await self._data_queue.get() + running = data is not None + if data: + yield data From 3621fceae2555ca24fac993086b19241740f780a Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Wed, 25 Sep 2024 09:19:28 -0700 Subject: [PATCH 19/60] fixes as noted by aleix --- src/pipecat/services/ai_services.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 197067adc..b32eb708d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -259,6 +259,10 @@ def __init__( async def flush_audio(self): pass + async def say(self, text: str): + await super.say(text) + await self.flush_audio() + async def start(self, frame: StartFrame): await super().start(frame) if self._push_stop_frames: @@ -278,6 +282,11 @@ async def cancel(self, frame: CancelFrame): await self._stop_frame_task self._stop_frame_task = None + async def process_frame(self, frame: Frame, direction: FrameDirection): + super().process_frame(frame, direction) + if isinstance(frame, TTSSpeakFrame): + await self.flush_audio() + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): await super().push_frame(frame, direction) From 3d43ad0f4dcb9570202516dd0095a26226853aca Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Wed, 25 Sep 2024 10:59:00 -0700 Subject: [PATCH 20/60] actually save the file --- src/pipecat/services/ai_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index b32eb708d..21ef8dfca 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -283,7 +283,7 @@ async def cancel(self, frame: CancelFrame): self._stop_frame_task = None async def process_frame(self, frame: Frame, direction: FrameDirection): - super().process_frame(frame, direction) + await super().process_frame(frame, direction) if isinstance(frame, TTSSpeakFrame): await self.flush_audio() From c4e94e280eca9e6a2fcbfacae2bcba8b3448f57d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 25 Sep 2024 16:35:33 -0700 Subject: [PATCH 21/60] processors: add support for event handlers --- CHANGELOG.md | 10 ++++++++ 
src/pipecat/processors/frame_processor.py | 30 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c4bf3c92..f35978dcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- All `FrameProcessors` can now register event handlers. + +``` +tts = SomeTTSService(...) + +@tts.event_handler("on_connected"): +async def on_connected(processor): + ... +``` + - Added `AsyncGeneratorProcessor`. This processor can be used together with a `FrameSerializer` as an async generator. It provides a `generator()` function that returns an `AsyncGenerator` and that yields serialized frames. diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index b56846aa6..f71e066d7 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -5,6 +5,7 @@ # import asyncio +import inspect from enum import Enum @@ -48,6 +49,8 @@ def __init__( self._loop: asyncio.AbstractEventLoop = loop or asyncio.get_running_loop() self._sync = sync + self._event_handlers: dict = {} + # Clock self._clock: BaseClock | None = None @@ -169,6 +172,23 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect else: await self.__push_queue.put((frame, direction)) + def event_handler(self, event_name: str): + def decorator(handler): + self.add_event_handler(event_name, handler) + return handler + + return decorator + + def add_event_handler(self, event_name: str, handler): + if event_name not in self._event_handlers: + raise Exception(f"Event handler {event_name} not registered") + self._event_handlers[event_name].append(handler) + + def _register_event_handler(self, event_name: str): + if event_name in self._event_handlers: + raise Exception(f"Event handler {event_name} already registered") + self._event_handlers[event_name] = [] + # # Handle interruptions # @@ -212,5 +232,15 @@ async def __push_frame_task_handler(self): except asyncio.CancelledError: break + async def _call_event_handler(self, event_name: str, *args, **kwargs): + try: + for handler in self._event_handlers[event_name]: + if inspect.iscoroutinefunction(handler): + await handler(self, *args, **kwargs) + else: + handler(self, *args, **kwargs) + except Exception as e: + logger.exception(f"Exception in event handler {event_name}: {e}") + def __str__(self): return self.name From f06aa300d01b202f018265c3e880f147566b302c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 25 Sep 2024 16:35:49 -0700 Subject: [PATCH 22/60] rtvi: add on_bot_ready event --- src/pipecat/processors/frameworks/rtvi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 1458cc21d..930b2331d 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -324,6 +324,8 @@ def __init__( self._message_task = self.get_event_loop().create_task(self._message_task_handler()) self._message_queue = asyncio.Queue() + self._register_event_handler("on_bot_ready") + def register_action(self, action: RTVIAction): id = self._action_id(action.service, action.action) self._registered_actions[id] = action @@ -624,8 +626,9 @@ async def _handle_action(self, request_id: str | None, data: RTVIActionRun): async def _maybe_send_bot_ready(self): if self._pipeline_started and 
self._client_ready: - await self._send_bot_ready() await self._update_config(self._config, False) + await self._send_bot_ready() + await self._call_event_handler("on_bot_ready") async def _send_bot_ready(self): if not self._params.send_bot_ready: From 73da8c1910fc7ae37f3f1ac6bf8c50d89aabd927 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 25 Sep 2024 22:40:36 -0400 Subject: [PATCH 23/60] Improve usability of Deepgram TTS: use Deepgram client, remove aiohttp --- .../07c-interruptible-deepgram.py | 16 ++-- src/pipecat/services/deepgram.py | 85 +++++++++---------- 2 files changed, 47 insertions(+), 54 deletions(-) diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index 41bef8a47..fc33c246f 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -5,10 +5,14 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -22,12 +26,6 @@ from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -52,9 +50,7 @@ async def main(): stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - tts = DeepgramTTSService( - aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en" - ) + tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 914bc2ec2..6929e66e5 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -4,10 +4,11 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp - +import asyncio from typing import AsyncGenerator +from loguru import logger + from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -15,27 +16,25 @@ Frame, InterimTranscriptionFrame, StartFrame, + TranscriptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - TranscriptionFrame, ) from pipecat.services.ai_services import STTService, TTSService from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 -from loguru import logger - - # See .env.example for Deepgram configuration needed try: from deepgram import ( AsyncListenWebSocketClient, DeepgramClient, DeepgramClientOptions, - LiveTranscriptionEvents, LiveOptions, LiveResultResponse, + LiveTranscriptionEvents, + SpeakOptions, ) except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -50,9 +49,7 @@ def __init__( self, *, api_key: str, - aiohttp_session: aiohttp.ClientSession, voice: str = "aura-helios-en", - base_url: str = "https://api.deepgram.com/v1/speak", sample_rate: int = 16000, encoding: str = "linear16", **kwargs, @@ -60,11 +57,9 @@ def __init__( super().__init__(**kwargs) self._voice = voice - self._api_key = api_key - self._base_url = base_url self._sample_rate = sample_rate self._encoding = encoding - self._aiohttp_session = aiohttp_session + self._deepgram_client = DeepgramClient(api_key=api_key) def 
can_generate_metrics(self) -> bool: return True @@ -76,43 +71,45 @@ async def set_voice(self, voice: str): async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") - base_url = self._base_url - request_url = f"{base_url}?model={self._voice}&encoding={self._encoding}&container=none&sample_rate={self._sample_rate}" - headers = {"authorization": f"token {self._api_key}"} - body = {"text": text} + options = SpeakOptions( + model=self._voice, + encoding=self._encoding, + sample_rate=self._sample_rate, + container="none", + ) try: await self.start_ttfb_metrics() - async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r: - if r.status != 200: - response_text = await r.text() - # If we get a a "Bad Request: Input is unutterable", just print out a debug log. - # All other unsuccesful requests should emit an error frame. If not specifically - # handled by the running PipelineTask, the ErrorFrame will cancel the task. - if "unutterable" in response_text: - logger.debug(f"Unutterable text: [{text}]") - return - - logger.error( - f"{self} error getting audio (status: {r.status}, error: {response_text})" - ) - yield ErrorFrame( - f"Error getting audio (status: {r.status}, error: {response_text})" - ) - return - - await self.start_tts_usage_metrics(text) - - await self.push_frame(TTSStartedFrame()) - async for data in r.content: - await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame( - audio=data, sample_rate=self._sample_rate, num_channels=1 - ) - yield frame - await self.push_frame(TTSStoppedFrame()) + + response = await asyncio.to_thread( + self._deepgram_client.speak.v("1").stream, {"text": text}, options + ) + + await self.start_tts_usage_metrics(text) + await self.push_frame(TTSStartedFrame()) + + # The response.stream_memory is already a BytesIO object + audio_buffer = response.stream_memory + + if audio_buffer is None: + raise ValueError("No audio data received from Deepgram") + + # Read and yield the audio data in chunks + audio_buffer.seek(0) # Ensure we're at the start of the buffer + chunk_size = 8192 # Use a fixed buffer size + while True: + await self.stop_ttfb_metrics() + chunk = audio_buffer.read(chunk_size) + if not chunk: + break + frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1) + yield frame + + await self.push_frame(TTSStoppedFrame()) + except Exception as e: logger.exception(f"{self} exception: {e}") + yield ErrorFrame(f"Error getting audio: {str(e)}") class DeepgramSTTService(STTService): From d05717a1bd709ecbf475bb2ff410763bcd8783cd Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Thu, 26 Sep 2024 19:52:25 +0900 Subject: [PATCH 24/60] Apply Ruff formater --- src/pipecat/services/cartesia.py | 41 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index f08c06dea..c1c296046 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -71,15 +71,16 @@ class InputParams(BaseModel): emotion: Optional[List[str]] = [] def __init__( - self, - *, - api_key: str, - voice_id: str, - cartesia_version: str = "2024-06-10", - url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - params: InputParams = InputParams(), - **kwargs): + self, + *, + api_key: str, + voice_id: str, + cartesia_version: str = "2024-06-10", + url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", + params: 
InputParams = InputParams(), + **kwargs, + ): # Aggregating sentences still gives cleaner-sounding results and fewer # artifacts than streaming one word at a time. On average, waiting for a # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama @@ -91,7 +92,10 @@ def __init__( # can use those to generate text frames ourselves aligned with the # playout timing of the audio! super().__init__( - aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs + aggregate_sentences=True, + push_text_frames=False, + sample_rate=params.sample_rate, + **kwargs, ) self._api_key = api_key @@ -137,11 +141,10 @@ async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) - def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True): - voice_config = { - "mode": "id", - "id": self._voice_id - } + def _build_msg( + self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True + ): + voice_config = {"mode": "id", "id": self._voice_id} if self._speed or self._emotion: voice_config["__experimental_controls"] = {} @@ -236,8 +239,7 @@ async def _receive_task_handler(self): await self.add_word_timestamps([("LLMFullResponseEndFrame", 0)]) elif msg["type"] == "timestamps": await self.add_word_timestamps( - list(zip(msg["word_timestamps"]["words"], - msg["word_timestamps"]["start"])) + list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"])) ) elif msg["type"] == "chunk": await self.stop_ttfb_metrics() @@ -254,8 +256,7 @@ async def _receive_task_handler(self): await self.stop_all_metrics() await self.push_error(ErrorFrame(f'{self} error: {msg["error"]}')) else: - logger.error( - f"Cartesia error, unknown message type: {msg}") + logger.error(f"Cartesia error, unknown message type: {msg}") except asyncio.CancelledError: pass except Exception as e: @@ -379,7 +380,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: output_format=self._output_format, language=self._language, stream=False, - _experimental_voice_controls=voice_controls + _experimental_voice_controls=voice_controls, ) await self.stop_ttfb_metrics() From f5e0b946c74695549f26135c8b92546fff22270d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 09:08:37 -0700 Subject: [PATCH 25/60] services(cartesia): fix string formatting --- src/pipecat/services/cartesia.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index c1c296046..e38d56db3 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -180,8 +180,7 @@ async def cancel(self, frame: CancelFrame): async def _connect(self): try: self._websocket = await websockets.connect( - f"{self._url}?api_key={self._api_key}&cartesia_version={ - self._cartesia_version}" + f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}" ) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) except Exception as e: From c7c709a0a79a27c542761c23f06606ec6d3dc0dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 10:31:53 -0700 Subject: [PATCH 26/60] github: cache venv when running tests --- .github/workflows/tests.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/tests.yaml 
b/.github/workflows/tests.yaml index ce3e13494..b806efad4 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -27,6 +27,13 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.10" + - name: Cache virtual environment + uses: actions/cache@v3 + with: + # We are hashing dev-requirements.txt and test-requirements.txt which + # contain all dependencies needed to run the tests. + key: venv-${{ runner.os }}-${{ steps.setup_python.outputs.python-version}}-${{ hashFiles('dev-requirements.txt') }}-${{ hashFiles('test-requirements.txt') }} + path: .venv - name: Install system packages id: install_system_packages run: | From 6a6ea251aec1f32425d9a5fb0031119feaaceed5 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 26 Sep 2024 09:31:09 -0400 Subject: [PATCH 27/60] Add AWS Polly TTS support --- README.md | 4 +- dot-env.template | 5 + .../foundational/07m-interruptible-aws.py | 98 ++++++++++++ pyproject.toml | 1 + src/pipecat/services/aws.py | 146 ++++++++++++++++++ test-requirements.txt | 1 + 6 files changed, 252 insertions(+), 3 deletions(-) create mode 100644 examples/foundational/07m-interruptible-aws.py create mode 100644 src/pipecat/services/aws.py diff --git a/README.md b/README.md index faf0137dc..793d1f630 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ pip install "pipecat-ai[option,...]" Your project may or may not need these, so they're made available as optional requirements. Here is a list: -- **AI services**: `anthropic`, `azure`, `deepgram`, `gladia`, `google`, `fal`, `lmnt`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`, `xtts` +- **AI services**: `anthropic`, `aws`, `azure`, `deepgram`, `gladia`, `google`, `fal`, `lmnt`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`, `xtts` - **Transports**: `local`, `websocket`, `daily` ## Code examples @@ -110,7 +110,6 @@ python app.py Daily provides a prebuilt WebRTC user interface. Whilst the app is running, you can visit at `https://.daily.co/` and listen to the bot say hello! - ## WebRTC for production use WebSockets are fine for server-to-server communication or for initial development. But for production use, you’ll need client-server audio to use a protocol designed for real-time media transport. (For an explanation of the difference between WebSockets and WebRTC, see [this post.](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/#webrtc)) @@ -131,7 +130,6 @@ pip install pipecat-ai[silero] The first time your run your bot with Silero, startup may take a while whilst it downloads and caches the model in the background. You can check the progress of this in the console. - ## Hacking on the framework itself _Note that you may need to set up a virtual environment before following the instructions below. For instance, you might need to run the following from the root of the repo:_ diff --git a/dot-env.template b/dot-env.template index 085e8b19d..e940b1076 100644 --- a/dot-env.template +++ b/dot-env.template @@ -1,6 +1,11 @@ # Anthropic ANTHROPIC_API_KEY=... +# AWS +AWS_SECRET_ACCESS_KEY=... +AWS_ACCESS_KEY_ID=... +AWS_REGION=... + # Azure AZURE_SPEECH_REGION=... AZURE_SPEECH_API_KEY=... 
diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py new file mode 100644 index 000000000..891ffd381 --- /dev/null +++ b/examples/foundational/07m-interruptible-aws.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.frames.frames import LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) +from pipecat.services.aws import AWSTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVADAnalyzer + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + audio_out_sample_rate=16000, + transcription_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + ) + + tts = AWSTTSService( + api_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + region=os.getenv("AWS_REGION"), + voice_id="Amy", + params=AWSTTSService.InputParams(engine="neural", language="en-GB", rate="1.05"), + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) + + task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. 
+        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([LLMMessagesFrame(messages)])
+
+    runner = PipelineRunner()
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index 46345ed71..8dcfd7cb0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ Website = "https://pipecat.ai"
 
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.34.0" ]
+aws = [ "boto3~=1.35.27" ]
 azure = [ "azure-cognitiveservices-speech~=1.40.0" ]
 cartesia = [ "cartesia~=1.0.13", "websockets~=12.0" ]
 daily = [ "daily-python~=0.10.1" ]
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
new file mode 100644
index 000000000..dfca10131
--- /dev/null
+++ b/src/pipecat/services/aws.py
@@ -0,0 +1,146 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+from typing import AsyncGenerator, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    Frame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.ai_services import TTSService
+
+try:
+    import boto3
+    from botocore.exceptions import BotoCoreError, ClientError
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use AWS, you need to `pip install pipecat-ai[aws]`. Also, set the `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variables."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+class AWSTTSService(TTSService):
+    class InputParams(BaseModel):
+        engine: Optional[str] = None
+        language: Optional[str] = None
+        pitch: Optional[str] = None
+        rate: Optional[str] = None
+        volume: Optional[str] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aws_access_key_id: str,
+        region: str,
+        voice_id: str = "Joanna",
+        sample_rate: int = 16000,
+        params: InputParams = InputParams(),
+        **kwargs,
+    ):
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        self._polly_client = boto3.client(
+            "polly",
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=api_key,
+            region_name=region,
+        )
+        self._voice_id = voice_id
+        self._sample_rate = sample_rate
+        self._params = params
+
+    def can_generate_metrics(self) -> bool:
+        return True
+
+    def _construct_ssml(self, text: str) -> str:
+        ssml = "<speak>"
+
+        if self._params.language:
+            ssml += f"<lang xml:lang='{self._params.language}'>"
+
+        prosody_attrs = []
+        # Prosody tags are only supported for standard and neural engines
+        if self._params.engine != "generative":
+            if self._params.rate:
+                prosody_attrs.append(f"rate='{self._params.rate}'")
+            if self._params.pitch:
+                prosody_attrs.append(f"pitch='{self._params.pitch}'")
+            if self._params.volume:
+                prosody_attrs.append(f"volume='{self._params.volume}'")
+
+            if prosody_attrs:
+                ssml += f"<prosody {' '.join(prosody_attrs)}>"
+        else:
+            logger.warning("Prosody tags are not supported for generative engine. Ignoring.")
+
+        ssml += text
+
+        if prosody_attrs:
+            ssml += "</prosody>"
+
+        if self._params.language:
+            ssml += "</lang>"
+
+        ssml += "</speak>"
+
+        return ssml
+
+    async def set_voice(self, voice: str):
+        logger.debug(f"Switching TTS voice to: [{voice}]")
+        self._voice_id = voice
+
+    async def set_engine(self, engine: str):
+        logger.debug(f"Switching TTS engine to: [{engine}]")
+        self._params.engine = engine
+
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        logger.debug(f"Generating TTS: [{text}]")
+
+        try:
+            await self.start_ttfb_metrics()
+
+            # Construct the parameters dictionary
+            ssml = self._construct_ssml(text)
+
+            params = {
+                "Text": ssml,
+                "TextType": "ssml",
+                "OutputFormat": "pcm",
+                "VoiceId": self._voice_id,
+                "Engine": self._params.engine,
+                "SampleRate": str(self._sample_rate),
+            }
+
+            # Filter out None values
+            filtered_params = {k: v for k, v in params.items() if v is not None}
+
+            response = self._polly_client.synthesize_speech(**filtered_params)
+
+            await self.start_tts_usage_metrics(text)
+
+            await self.push_frame(TTSStartedFrame())
+
+            if "AudioStream" in response:
+                with response["AudioStream"] as stream:
+                    audio_data = stream.read()
+                    chunk_size = 4096  # You can adjust this value
+                    for i in range(0, len(audio_data), chunk_size):
+                        chunk = audio_data[i : i + chunk_size]
+                        if len(chunk) > 0:
+                            await self.stop_ttfb_metrics()
+                            frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
+                            yield frame
+
+            await self.push_frame(TTSStoppedFrame())
+
+        except (BotoCoreError, ClientError) as error:
+            logger.exception(f"{self} error generating TTS: {error}")
diff --git a/test-requirements.txt b/test-requirements.txt
index 78280b139..94c81331d 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,6 +1,7 @@
 aiohttp~=3.10.3
 anthropic~=0.30.0
 azure-cognitiveservices-speech~=1.40.0
+boto3~=1.35.27
 daily-python~=0.10.1
 deepgram-sdk~=3.5.0
 fal-client~=0.4.1

From 298b1514862c6a374869c051e6b44a517f2c706c Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Thu, 26 Sep 2024 13:05:39 -0400
Subject: [PATCH 28/60] Add setter methods

---
 src/pipecat/services/aws.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index dfca10131..b32ab05f0 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -102,6 +102,26 @@ async def set_engine(self, engine: str):
         logger.debug(f"Switching TTS engine to: [{engine}]")
         self._params.engine = engine
 
+    async def set_language(self, language: str):
+        logger.debug(f"Switching TTS language to: [{language}]")
+        self._params.language = language
+
+    async def set_pitch(self, pitch: str):
+        logger.debug(f"Switching TTS pitch to: [{pitch}]")
+        self._params.pitch = pitch
+
+    async def set_rate(self, rate: str):
+        logger.debug(f"Switching TTS rate to: [{rate}]")
+        self._params.rate = rate
+
+    async def set_volume(self, volume: str):
+        logger.debug(f"Switching TTS volume to: [{volume}]")
+        self._params.volume = volume
+
+    async def set_params(self, params: InputParams):
+        logger.debug(f"Switching TTS params to: [{params}]")
+        self._params = params
+
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
 

From d3a477902b079388067ed41bb969de4454fae07b Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Thu, 26 Sep 2024 13:08:11 -0400
Subject: [PATCH 29/60] Add changelog entry

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f35978dcd..c7a525c82 100644
---
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added AWS Polly TTS support. + +- Added InputParams to Azure TTS service. + - All `FrameProcessors` can now register event handlers. ``` From b8ece84c6ecf4bf74119371f241a4499d675b7e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 10:39:00 -0700 Subject: [PATCH 30/60] services: super should be super() --- src/pipecat/services/ai_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 21ef8dfca..16280b024 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -260,7 +260,7 @@ async def flush_audio(self): pass async def say(self, text: str): - await super.say(text) + await super().say(text) await self.flush_audio() async def start(self, frame: StartFrame): From d323ea9e95960b17f8a541cbd616bd4272de1caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 16:44:49 -0700 Subject: [PATCH 31/60] async_generator: keep pushing frames downstream --- src/pipecat/processors/async_generator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pipecat/processors/async_generator.py b/src/pipecat/processors/async_generator.py index 66b2a3e99..4f9bc85d0 100644 --- a/src/pipecat/processors/async_generator.py +++ b/src/pipecat/processors/async_generator.py @@ -26,6 +26,8 @@ def __init__(self, *, serializer: FrameSerializer, **kwargs): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) + await self.push_frame(frame, direction) + if isinstance(frame, (CancelFrame, EndFrame)): await self._data_queue.put(None) else: From 706c00d89704f6797e464df9de7c5587bdffd719 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 26 Sep 2024 22:13:37 -0400 Subject: [PATCH 32/60] Code review feedback --- CHANGELOG.md | 2 +- examples/foundational/07m-interruptible-aws.py | 6 +++++- src/pipecat/services/aws.py | 8 +++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a525c82..474f06989 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added AWS Polly TTS support. +- Added AWS Polly TTS support and `07m-interruptible-aws.py` as an example. - Added InputParams to Azure TTS service. 
diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index 891ffd381..69d4b84c1 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -22,6 +22,7 @@ LLMUserResponseAggregator, ) from pipecat.services.aws import AWSTTSService +from pipecat.services.deepgram import DeepgramSTTService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer @@ -43,12 +44,14 @@ async def main(): DailyParams( audio_out_enabled=True, audio_out_sample_rate=16000, - transcription_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True, ), ) + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + tts = AWSTTSService( api_key=os.getenv("AWS_SECRET_ACCESS_KEY"), aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), @@ -72,6 +75,7 @@ async def main(): pipeline = Pipeline( [ transport.input(), # Transport user input + stt, # STT tma_in, # User responses llm, # LLM tts, # TTS diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py index b32ab05f0..f3b2766bc 100644 --- a/src/pipecat/services/aws.py +++ b/src/pipecat/services/aws.py @@ -9,6 +9,7 @@ from pydantic import BaseModel from pipecat.frames.frames import ( + ErrorFrame, Frame, TTSAudioRawFrame, TTSStartedFrame, @@ -152,7 +153,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: if "AudioStream" in response: with response["AudioStream"] as stream: audio_data = stream.read() - chunk_size = 4096 # You can adjust this value + chunk_size = 8192 for i in range(0, len(audio_data), chunk_size): chunk = audio_data[i : i + chunk_size] if len(chunk) > 0: @@ -164,3 +165,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: except (BotoCoreError, ClientError) as error: logger.exception(f"{self} error generating TTS: {error}") + error_message = f"AWS Polly TTS error: {str(error)}" + yield ErrorFrame(error=error_message) + + finally: + await self.push_frame(TTSStoppedFrame()) From 2a05cd35b0b5a295126d88a2170c0fd78cad0695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 16:45:20 -0700 Subject: [PATCH 33/60] rtvi: add multiple RTVI frame processors --- src/pipecat/processors/frameworks/rtvi.py | 281 +++++++++++++++++----- 1 file changed, 220 insertions(+), 61 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 930b2331d..03d63c7f0 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -5,6 +5,7 @@ # import asyncio +import base64 from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field, PrivateAttr, ValidationError @@ -20,8 +21,14 @@ ErrorFrame, Frame, InterimTranscriptionFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + OutputAudioRawFrame, StartFrame, SystemFrame, + TTSStartedFrame, + TTSStoppedFrame, + TextFrame, TranscriptionFrame, TransportMessageFrame, UserStartedSpeakingFrame, @@ -242,33 +249,75 @@ class RTVILLMFunctionCallResultData(BaseModel): result: dict | str +class RTVIBotLLMStartedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-llm-started"] = "bot-llm-started" + + +class RTVIBotLLMStoppedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: 
Literal["bot-llm-stopped"] = "bot-llm-stopped" + + +class RTVIBotTTSStartedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-tts-started"] = "bot-tts-started" + + +class RTVIBotTTSStoppedMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-tts-stopped"] = "bot-tts-stopped" + + class RTVITextMessageData(BaseModel): text: str -class RTVILLMTextMessage(BaseModel): +class RTVIBotLLMTextMessage(BaseModel): label: Literal["rtvi-ai"] = "rtvi-ai" - type: Literal["llm-text"] = "llm-text" + type: Literal["bot-llm-text"] = "bot-llm-text" data: RTVITextMessageData -class RTVITTSTextMessage(BaseModel): +class RTVIBotTTSTextMessage(BaseModel): label: Literal["rtvi-ai"] = "rtvi-ai" - type: Literal["tts-text"] = "tts-text" + type: Literal["bot-tts-text"] = "bot-tts-text" data: RTVITextMessageData -class RTVITranscriptionMessageData(BaseModel): +class RTVIAudioMessageData(BaseModel): + audio: str + sample_rate: int + num_channels: int + + +class RTVIBotAudioMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-audio"] = "bot-audio" + data: RTVIAudioMessageData + + +class RTVIBotTranscriptionMessageData(BaseModel): + text: str + + +class RTVIBotTranscriptionMessage(BaseModel): + label: Literal["rtvi-ai"] = "rtvi-ai" + type: Literal["bot-transcription"] = "bot-transcription" + data: RTVIBotTranscriptionMessageData + + +class RTVIUserTranscriptionMessageData(BaseModel): text: str user_id: str timestamp: str final: bool -class RTVITranscriptionMessage(BaseModel): +class RTVIUserTranscriptionMessage(BaseModel): label: Literal["rtvi-ai"] = "rtvi-ai" type: Literal["user-transcription"] = "user-transcription" - data: RTVITranscriptionMessageData + data: RTVIUserTranscriptionMessageData class RTVIUserStartedSpeakingMessage(BaseModel): @@ -295,6 +344,170 @@ class RTVIProcessorParams(BaseModel): send_bot_ready: bool = True +class RTVIFrameProcessor(FrameProcessor): + def __init__(self, direction: FrameDirection = FrameDirection.DOWNSTREAM, **kwargs): + super().__init__(**kwargs) + self._direction = direction + + async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True): + frame = TransportMessageFrame( + message=model.model_dump(exclude_none=exclude_none), urgent=True + ) + await self.push_frame(frame, self._direction) + + +class RTVISpeakingProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, (UserStartedSpeakingFrame, UserStoppedSpeakingFrame)): + await self._handle_interruptions(frame) + elif isinstance(frame, (BotStartedSpeakingFrame, BotStoppedSpeakingFrame)): + await self._handle_bot_speaking(frame) + + async def _handle_interruptions(self, frame: Frame): + message = None + if isinstance(frame, UserStartedSpeakingFrame): + message = RTVIUserStartedSpeakingMessage() + elif isinstance(frame, UserStoppedSpeakingFrame): + message = RTVIUserStoppedSpeakingMessage() + + if message: + await self._push_transport_message(message) + + async def _handle_bot_speaking(self, frame: Frame): + message = None + if isinstance(frame, BotStartedSpeakingFrame): + message = RTVIBotStartedSpeakingMessage() + elif isinstance(frame, BotStoppedSpeakingFrame): + message = RTVIBotStoppedSpeakingMessage() + + if message: + await self._push_transport_message(message) + + +class 
RTVIUserTranscriptionProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, (TranscriptionFrame, InterimTranscriptionFrame)): + await self._handle_user_transcriptions(frame) + + async def _handle_user_transcriptions(self, frame: Frame): + message = None + if isinstance(frame, TranscriptionFrame): + message = RTVIUserTranscriptionMessage( + data=RTVIUserTranscriptionMessageData( + text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=True + ) + ) + elif isinstance(frame, InterimTranscriptionFrame): + message = RTVIUserTranscriptionMessage( + data=RTVIUserTranscriptionMessageData( + text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=False + ) + ) + + if message: + await self._push_transport_message(message) + + +class RTVIBotLLMProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, LLMFullResponseStartFrame): + await self._push_transport_message(RTVIBotLLMStartedMessage()) + elif isinstance(frame, LLMFullResponseEndFrame): + await self._push_transport_message(RTVIBotLLMStoppedMessage()) + + +class RTVIBotTTSProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, TTSStartedFrame): + await self._push_transport_message(RTVIBotTTSStartedMessage()) + elif isinstance(frame, TTSStoppedFrame): + await self._push_transport_message(RTVIBotTTSStoppedMessage()) + + +class RTVIBotLLMTextProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, TextFrame): + await self._handle_text(frame) + + async def _handle_text(self, frame: TextFrame): + message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text)) + await self._push_transport_message(message) + + +class RTVIBotTTSTextProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, TextFrame): + await self._handle_text(frame) + + async def _handle_text(self, frame: TextFrame): + message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text)) + await self._push_transport_message(message) + + +class RTVIBotAudioProcessor(RTVIFrameProcessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + await self.push_frame(frame, direction) + + if isinstance(frame, OutputAudioRawFrame): + await self._handle_audio(frame) + + async def _handle_audio(self, frame: OutputAudioRawFrame): + encoded = base64.b64encode(frame.audio).decode("utf-8") + message = 
RTVIBotAudioMessage( + data=RTVIAudioMessageData( + audio=encoded, sample_rate=frame.sample_rate, num_channels=frame.num_channels + ) + ) + await self._push_transport_message(message) + + class RTVIProcessor(FrameProcessor): def __init__( self, @@ -394,20 +607,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # finish and the task finishes when EndFrame is processed. await self.push_frame(frame, direction) await self._stop(frame) - elif isinstance(frame, UserStartedSpeakingFrame) or isinstance( - frame, UserStoppedSpeakingFrame - ): - await self._handle_interruptions(frame) - await self.push_frame(frame, direction) - elif isinstance(frame, BotStartedSpeakingFrame) or isinstance( - frame, BotStoppedSpeakingFrame - ): - await self._handle_bot_speaking(frame) - await self.push_frame(frame, direction) # Data frames - elif isinstance(frame, TranscriptionFrame) or isinstance(frame, InterimTranscriptionFrame): - await self._handle_transcriptions(frame) - await self.push_frame(frame, direction) elif isinstance(frame, TransportMessageFrame): await self._message_queue.put(frame) elif isinstance(frame, RTVIActionFrame): @@ -452,47 +652,6 @@ async def _push_transport_message(self, model: BaseModel, exclude_none: bool = T ) await self.push_frame(frame) - async def _handle_transcriptions(self, frame: Frame): - # TODO(aleix): Once we add support for using custom pipelines, the STTs will - # be in the pipeline after this processor. - - message = None - if isinstance(frame, TranscriptionFrame): - message = RTVITranscriptionMessage( - data=RTVITranscriptionMessageData( - text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=True - ) - ) - elif isinstance(frame, InterimTranscriptionFrame): - message = RTVITranscriptionMessage( - data=RTVITranscriptionMessageData( - text=frame.text, user_id=frame.user_id, timestamp=frame.timestamp, final=False - ) - ) - - if message: - await self._push_transport_message(message) - - async def _handle_interruptions(self, frame: Frame): - message = None - if isinstance(frame, UserStartedSpeakingFrame): - message = RTVIUserStartedSpeakingMessage() - elif isinstance(frame, UserStoppedSpeakingFrame): - message = RTVIUserStoppedSpeakingMessage() - - if message: - await self._push_transport_message(message) - - async def _handle_bot_speaking(self, frame: Frame): - message = None - if isinstance(frame, BotStartedSpeakingFrame): - message = RTVIBotStartedSpeakingMessage() - elif isinstance(frame, BotStoppedSpeakingFrame): - message = RTVIBotStoppedSpeakingMessage() - - if message: - await self._push_transport_message(message) - async def _action_task_handler(self): while True: try: From 6e8a202107ad31f4d0987792128fab662321c5b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 22:42:19 -0700 Subject: [PATCH 34/60] rtvi: fix handling transport messages --- src/pipecat/processors/frameworks/rtvi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 03d63c7f0..9721a6613 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -609,7 +609,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self._stop(frame) # Data frames elif isinstance(frame, TransportMessageFrame): - await self._message_queue.put(frame) + await self._handle_transport_message(frame) elif isinstance(frame, RTVIActionFrame): await 
self._action_queue.put(frame) # Other frames From 2c8e5665076ad9f187982a196e3cab2f1bd11b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 26 Sep 2024 22:42:36 -0700 Subject: [PATCH 35/60] rtvi: update version to 0.2 --- src/pipecat/processors/frameworks/rtvi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 9721a6613..f88660f60 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -41,7 +41,7 @@ from loguru import logger -RTVI_PROTOCOL_VERSION = "0.1" +RTVI_PROTOCOL_VERSION = "0.2" ActionResult = Union[bool, int, float, str, list, dict] From 830d2df671a93a8b2293ee9e4a7767fd6abc2874 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 26 Sep 2024 13:03:46 -0400 Subject: [PATCH 36/60] Add Google TTS --- CHANGELOG.md | 2 + .../foundational/07n-interruptible-google.py | 100 ++++++++ pyproject.toml | 2 +- src/pipecat/services/google.py | 217 +++++++++++++++++- test-requirements.txt | 1 + 5 files changed, 312 insertions(+), 10 deletions(-) create mode 100644 examples/foundational/07n-interruptible-google.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 474f06989..37189eb47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added Google TTS service and corresponding foundational example `07n-interruptible-google.py` + - Added AWS Polly TTS support and `07m-interruptible-aws.py` as an example. - Added InputParams to Azure TTS service. diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py new file mode 100644 index 000000000..713b3dce3 --- /dev/null +++ b/examples/foundational/07n-interruptible-google.py @@ -0,0 +1,100 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.frames.frames import LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, + LLMUserResponseAggregator, +) +from pipecat.services.deepgram import DeepgramSTTService +from pipecat.services.google import GoogleTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVADAnalyzer + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + audio_out_sample_rate=24000, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True, + ), + ) + + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + + tts = GoogleTTSService( + credentials=os.getenv("GOOGLE_CREDENTIALS"), + voice_id="en-US-Neural2-J", + params=GoogleTTSService.InputParams(language="en-US", rate="1.05"), + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + 
messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ] + ) + + task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMMessagesFrame(messages)]) + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 8dcfd7cb0..876242343 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ elevenlabs = [ "websockets~=12.0" ] examples = [ "python-dotenv~=1.0.1", "flask~=3.0.3", "flask_cors~=4.0.1" ] fal = [ "fal-client~=0.4.1" ] gladia = [ "websockets~=12.0" ] -google = [ "google-generativeai~=0.7.2" ] +google = [ "google-generativeai~=0.7.2", "google-cloud-texttospeech~=2.17.2" ] gstreamer = [ "pygobject~=3.48.2" ] fireworks = [ "openai~=1.37.2" ] langchain = [ "langchain~=0.2.14", "langchain-community~=0.2.12", "langchain-openai~=0.1.20" ] diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 4de6b77fa..2a0a7d1e1 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -5,30 +5,37 @@ # import asyncio +import json +from typing import AsyncGenerator, List, Literal, Optional -from typing import List +import numpy as np +from loguru import logger +from pydantic import BaseModel from pipecat.frames.frames import ( Frame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMMessagesFrame, LLMModelUpdateFrame, TextFrame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, VisionImageRawFrame, - LLMMessagesFrame, - LLMFullResponseStartFrame, - LLMFullResponseEndFrame, ) -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, ) - -from loguru import logger +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService, TTSService try: - import google.generativeai as gai import google.ai.generativelanguage as glm + import google.generativeai as gai + from google.cloud import texttospeech_v1 + from google.oauth2 import service_account except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -137,3 +144,195 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if context: await self._process_context(context) + + +class GoogleTTSService(TTSService): + class InputParams(BaseModel): + pitch: Optional[str] = None + rate: Optional[str] = None + volume: Optional[str] = None + emphasis: Optional[Literal["strong", "moderate", "reduced", 
"none"]] = None + language: Optional[str] = None + gender: Optional[Literal["male", "female", "neutral"]] = None + google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None + + def __init__( + self, + *, + credentials: Optional[str] = None, + credentials_path: Optional[str] = None, + voice_id: str = "en-US-Neural2-A", + sample_rate: int = 24000, + params: InputParams = InputParams(), + **kwargs, + ): + super().__init__(sample_rate=sample_rate, **kwargs) + + self._voice_id: str = voice_id + self._params = params + self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client( + credentials, credentials_path + ) + + def _create_client( + self, credentials: Optional[str], credentials_path: Optional[str] + ) -> texttospeech_v1.TextToSpeechAsyncClient: + creds: Optional[service_account.Credentials] = None + + # Create a Google Cloud service account for the Cloud Text-to-Speech API + # Using either the provided credentials JSON string or the path to a service account JSON + # file, create a Google Cloud service account and use it to authenticate with the API. + if credentials: + # Use provided credentials JSON string + json_account_info = json.loads(credentials) + creds = service_account.Credentials.from_service_account_info(json_account_info) + elif credentials_path: + # Use service account JSON file if provided + creds = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise ValueError("Either 'credentials' or 'credentials_path' must be provided.") + + return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds) + + def can_generate_metrics(self) -> bool: + return True + + def _construct_ssml(self, text: str) -> str: + ssml = "" + + # Voice tag + voice_attrs = [f"name='{self._voice_id}'"] + if self._params.language: + voice_attrs.append(f"language='{self._params.language}'") + if self._params.gender: + voice_attrs.append(f"gender='{self._params.gender}'") + ssml += f"" + + # Prosody tag + prosody_attrs = [] + if self._params.pitch: + prosody_attrs.append(f"pitch='{self._params.pitch}'") + if self._params.rate: + prosody_attrs.append(f"rate='{self._params.rate}'") + if self._params.volume: + prosody_attrs.append(f"volume='{self._params.volume}'") + + if prosody_attrs: + ssml += f"" + + # Emphasis tag + if self._params.emphasis: + ssml += f"" + + # Google style tag + if self._params.google_style: + ssml += f"" + + ssml += text + + # Close tags + if self._params.google_style: + ssml += "" + if self._params.emphasis: + ssml += "" + if prosody_attrs: + ssml += "" + ssml += "" + + return ssml + + async def set_voice(self, voice: str) -> None: + logger.debug(f"Switching TTS voice to: [{voice}]") + self._voice_id = voice + + async def set_language(self, language: str) -> None: + logger.debug(f"Switching TTS language to: [{language}]") + self._params.language = language + + async def set_pitch(self, pitch: str) -> None: + logger.debug(f"Switching TTS pitch to: [{pitch}]") + self._params.pitch = pitch + + async def set_rate(self, rate: str) -> None: + logger.debug(f"Switching TTS rate to: [{rate}]") + self._params.rate = rate + + async def set_volume(self, volume: str) -> None: + logger.debug(f"Switching TTS volume to: [{volume}]") + self._params.volume = volume + + async def set_emphasis( + self, emphasis: Literal["strong", "moderate", "reduced", "none"] + ) -> None: + logger.debug(f"Switching TTS emphasis to: [{emphasis}]") + self._params.emphasis = emphasis + + async def set_gender(self, gender: Literal["male", 
"female", "neutral"]) -> None: + logger.debug(f"Switch TTS gender to [{gender}]") + self._params.gender = gender + + async def google_style( + self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"] + ) -> None: + logger.debug(f"Switching TTS google style to: [{google_style}]") + self._params.google_style = google_style + + async def set_params(self, params: InputParams) -> None: + logger.debug(f"Switching TTS params to: [{params}]") + self._params = params + + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + logger.debug(f"Generating TTS: [{text}]") + + try: + await self.start_ttfb_metrics() + + ssml = self._construct_ssml(text) + synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml) + voice = texttospeech_v1.VoiceSelectionParams( + language_code=self._params.language, name=self._voice_id + ) + audio_config = texttospeech_v1.AudioConfig( + audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16, + sample_rate_hertz=self.sample_rate, + ) + + request = texttospeech_v1.SynthesizeSpeechRequest( + input=synthesis_input, voice=voice, audio_config=audio_config + ) + + response = await self._client.synthesize_speech(request=request) + + await self.start_tts_usage_metrics(text) + + await self.push_frame(TTSStartedFrame()) + + # The audio produced by the TTS service has an audible click or pop at the beginning. + # This is due to the abrupt start of the audio waveform. To mitigate this, we apply a + # short fade-in effect to the audio data. + + # Convert the response to a mutable numpy array + audio_content = np.frombuffer(response.audio_content, dtype=np.int16).copy() + + # Apply a smooth, short fade-in + fade_duration = int(0.01 * self.sample_rate) # 10ms fade-in + fade_in = np.square( + np.linspace(0, 1, fade_duration) + ) # Quadratic fade for smoother start + audio_content[:fade_duration] = audio_content[:fade_duration] * fade_in + + # Read and yield audio data in chunks + chunk_size = 8192 + for i in range(0, len(audio_content), chunk_size): + chunk = audio_content[i : i + chunk_size].tobytes() + if not chunk: + break + await self.stop_ttfb_metrics() + frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) + yield frame + await asyncio.sleep(0) # Allow other tasks to run + + await self.push_frame(TTSStoppedFrame()) + + except Exception as e: + logger.exception(f"{self} error generating TTS: {e}") diff --git a/test-requirements.txt b/test-requirements.txt index 94c81331d..8c7db7377 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -7,6 +7,7 @@ deepgram-sdk~=3.5.0 fal-client~=0.4.1 fastapi~=0.112.1 faster-whisper~=1.0.3 +google-cloud-texttospeech~=2.17.2 google-generativeai~=0.7.2 langchain~=0.2.14 livekit~=0.13.1 From e7548f9494cc8cd0ab27eba8dae53bb3c9f3af36 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 08:02:44 -0400 Subject: [PATCH 37/60] Code review feedback --- src/pipecat/services/google.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 2a0a7d1e1..38af3e41f 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -8,11 +8,11 @@ import json from typing import AsyncGenerator, List, Literal, Optional -import numpy as np from loguru import logger from pydantic import BaseModel from pipecat.frames.frames import ( + ErrorFrame, Frame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, @@ -307,24 +307,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: 
await self.push_frame(TTSStartedFrame()) - # The audio produced by the TTS service has an audible click or pop at the beginning. - # This is due to the abrupt start of the audio waveform. To mitigate this, we apply a - # short fade-in effect to the audio data. - - # Convert the response to a mutable numpy array - audio_content = np.frombuffer(response.audio_content, dtype=np.int16).copy() - - # Apply a smooth, short fade-in - fade_duration = int(0.01 * self.sample_rate) # 10ms fade-in - fade_in = np.square( - np.linspace(0, 1, fade_duration) - ) # Quadratic fade for smoother start - audio_content[:fade_duration] = audio_content[:fade_duration] * fade_in + # Skip the first 44 bytes to remove the WAV header + audio_content = response.audio_content[44:] # Read and yield audio data in chunks chunk_size = 8192 for i in range(0, len(audio_content), chunk_size): - chunk = audio_content[i : i + chunk_size].tobytes() + chunk = audio_content[i : i + chunk_size] if not chunk: break await self.stop_ttfb_metrics() @@ -336,3 +325,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: except Exception as e: logger.exception(f"{self} error generating TTS: {e}") + error_message = f"TTS generation error: {str(e)}" + yield ErrorFrame(error=error_message) + finally: + await self.push_frame(TTSStoppedFrame()) From 50b6580fbb44b6f686fc249ae2fd160b8f60947b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 27 Sep 2024 13:28:33 -0700 Subject: [PATCH 38/60] livekit: add license notice --- src/pipecat/transports/services/livekit.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index 52bbbf89d..6e5e48d0b 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -1,10 +1,17 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio + from dataclasses import dataclass from typing import Any, Awaitable, Callable, List -import numpy as np -from loguru import logger from pydantic import BaseModel + +import numpy as np from scipy import signal from pipecat.frames.frames import ( @@ -28,6 +35,8 @@ from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.vad.vad_analyzer import VADAnalyzer +from loguru import logger + try: from livekit import rtc from tenacity import retry, stop_after_attempt, wait_exponential From d9b16d4f738f405287bfc79fd350852c48fb829a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 27 Sep 2024 13:32:27 -0700 Subject: [PATCH 39/60] services: import cosmetics --- src/pipecat/services/aws.py | 4 +++- src/pipecat/services/azure.py | 9 ++++++--- src/pipecat/services/deepgram.py | 5 +++-- src/pipecat/services/fal.py | 3 ++- src/pipecat/services/openai.py | 9 +++++---- src/pipecat/services/xtts.py | 4 ++-- 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py index f3b2766bc..80240985f 100644 --- a/src/pipecat/services/aws.py +++ b/src/pipecat/services/aws.py @@ -3,9 +3,9 @@ # # SPDX-License-Identifier: BSD 2-Clause License # + from typing import AsyncGenerator, Optional -from loguru import logger from pydantic import BaseModel from pipecat.frames.frames import ( @@ -17,6 +17,8 @@ ) from pipecat.services.ai_services import TTSService +from loguru import logger + try: import boto3 from botocore.exceptions 
import BotoCoreError, ClientError diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 41fc7598b..c8fa095ab 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import aiohttp import asyncio import io + from typing import AsyncGenerator, Optional -import aiohttp -from loguru import logger -from PIL import Image from pydantic import BaseModel from pipecat.frames.frames import ( @@ -29,6 +28,10 @@ from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 +from PIL import Image + +from loguru import logger + # See .env.example for Azure configuration needed try: from azure.cognitiveservices.speech import ( diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 6929e66e5..d109cce3c 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -5,9 +5,8 @@ # import asyncio -from typing import AsyncGenerator -from loguru import logger +from typing import AsyncGenerator from pipecat.frames.frames import ( CancelFrame, @@ -25,6 +24,8 @@ from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 +from loguru import logger + # See .env.example for Deepgram configuration needed try: from deepgram import ( diff --git a/src/pipecat/services/fal.py b/src/pipecat/services/fal.py index bb7b47dfc..aecdeb709 100644 --- a/src/pipecat/services/fal.py +++ b/src/pipecat/services/fal.py @@ -8,13 +8,14 @@ import io import os -from PIL import Image from pydantic import BaseModel from typing import AsyncGenerator, Optional, Union, Dict from pipecat.frames.frames import ErrorFrame, Frame, URLImageRawFrame from pipecat.services.ai_services import ImageGenService +from PIL import Image + from loguru import logger try: diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index e54898525..47bee5ec1 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -9,14 +9,12 @@ import io import json import httpx + from dataclasses import dataclass from typing import Any, AsyncGenerator, Dict, List, Literal, Optional from pydantic import BaseModel, Field -from loguru import logger -from PIL import Image - from pipecat.frames.frames import ( ErrorFrame, Frame, @@ -39,7 +37,6 @@ LLMUserContextAggregator, LLMAssistantContextAggregator, ) - from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, @@ -47,6 +44,10 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import ImageGenService, LLMService, TTSService +from PIL import Image + +from loguru import logger + try: from openai import AsyncOpenAI, AsyncStream, DefaultAsyncHttpxClient, BadRequestError, NOT_GIVEN from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 5161efcf6..2c47d59e8 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -18,10 +18,10 @@ ) from pipecat.services.ai_services import TTSService -from loguru import logger - import numpy as np +from loguru import logger + try: import resampy except ModuleNotFoundError as e: From 44a349386c135df45beb07c32b82f6e17747c2c7 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 10:43:49 -0400 Subject: [PATCH 40/60] Consolidate update frames classes into a single 
UpdateSettingsFrame class --- CHANGELOG.md | 3 + src/pipecat/frames/frames.py | 121 +++++----------------------- src/pipecat/services/ai_services.py | 38 +++++---- src/pipecat/services/anthropic.py | 63 +++++++++------ src/pipecat/services/google.py | 9 ++- src/pipecat/services/openai.py | 54 ++++++++----- src/pipecat/services/together.py | 62 ++++++++------ 7 files changed, 157 insertions(+), 193 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37189eb47..0f489556c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,9 @@ async def on_connected(processor): ### Changed +- Updated individual update settings frame classes into a single UpdateSettingsFrame + class for STT, LLM, and TTS. + - We now distinguish between input and output audio and image frames. We introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` and `OutputImageRawFrame` (and other subclasses of those). The input frames diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 273aad214..1b31b9c88 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -4,9 +4,8 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from typing import Any, List, Optional, Tuple - from dataclasses import dataclass, field +from typing import Any, List, Optional, Tuple from pipecat.clocks.base_clock import BaseClock from pipecat.metrics.metrics import MetricsData @@ -528,113 +527,35 @@ def __str__(self): @dataclass -class LLMModelUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM model.""" - - model: str - - -@dataclass -class LLMTemperatureUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM temperature.""" - - temperature: float - - -@dataclass -class LLMTopKUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM top_k.""" - - top_k: int - - -@dataclass -class LLMTopPUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM top_p.""" - - top_p: float - - -@dataclass -class LLMFrequencyPenaltyUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM frequency - penalty. - - """ - - frequency_penalty: float - - -@dataclass -class LLMPresencePenaltyUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM presence - penalty. 
+class LLMUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update LLM settings.""" - """ - - presence_penalty: float + model: Optional[str] = None + temperature: Optional[float] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + max_tokens: Optional[int] = None + seed: Optional[int] = None + extra: dict = field(default_factory=dict) @dataclass -class LLMMaxTokensUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM max tokens.""" - - max_tokens: int - - -@dataclass -class LLMSeedUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM seed.""" - - seed: int - - -@dataclass -class LLMExtraUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new LLM extra params.""" - - extra: dict - - -@dataclass -class TTSModelUpdateFrame(ControlFrame): - """A control frame containing a request to update the TTS model.""" - - model: str - - -@dataclass -class TTSVoiceUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new TTS voice.""" - - voice: str - - -@dataclass -class TTSLanguageUpdateFrame(ControlFrame): - """A control frame containing a request to update to a new TTS language and - optional voice. - - """ - - language: Language - - -@dataclass -class STTModelUpdateFrame(ControlFrame): - """A control frame containing a request to update the STT model and optional - language. - - """ +class TTSUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update TTS settings.""" - model: str + model: Optional[str] = None + voice: Optional[str] = None + language: Optional[Language] = None @dataclass -class STTLanguageUpdateFrame(ControlFrame): - """A control frame containing a request to update to STT language.""" +class STTUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update STT settings.""" - language: Language + model: Optional[str] = None + language: Optional[Language] = None @dataclass diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 16280b024..79e52531d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -7,10 +7,11 @@ import asyncio import io import wave - from abc import abstractmethod from typing import AsyncGenerator, List, Optional, Tuple +from loguru import logger + from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -18,31 +19,26 @@ ErrorFrame, Frame, LLMFullResponseEndFrame, - STTLanguageUpdateFrame, - STTModelUpdateFrame, StartFrame, StartInterruptionFrame, + STTUpdateSettingsFrame, + TextFrame, TTSAudioRawFrame, - TTSLanguageUpdateFrame, - TTSModelUpdateFrame, TTSSpeakFrame, TTSStartedFrame, TTSStoppedFrame, - TTSVoiceUpdateFrame, - TextFrame, + TTSUpdateSettingsFrame, UserImageRequestFrame, VisionImageRawFrame, ) from pipecat.metrics.metrics import MetricsData +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transcriptions.language import Language from pipecat.utils.audio import calculate_audio_volume from pipecat.utils.string import match_endofsentence from pipecat.utils.time import seconds_to_nanoseconds from pipecat.utils.utils import exp_smoothing -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext - -from loguru 
import logger class AIService(FrameProcessor): @@ -230,12 +226,13 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) elif isinstance(frame, TTSSpeakFrame): await self._push_tts_frames(frame.text) - elif isinstance(frame, TTSModelUpdateFrame): - await self.set_model(frame.model) - elif isinstance(frame, TTSVoiceUpdateFrame): - await self.set_voice(frame.voice) - elif isinstance(frame, TTSLanguageUpdateFrame): - await self.set_language(frame.language) + elif isinstance(frame, TTSUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.voice is not None: + await self.set_voice(frame.voice) + if frame.language is not None: + await self.set_language(frame.language) else: await self.push_frame(frame, direction) @@ -408,10 +405,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # In this service we accumulate audio internally and at the end we # push a TextFrame. We don't really want to push audio frames down. await self.process_audio_frame(frame) - elif isinstance(frame, STTModelUpdateFrame): - await self.set_model(frame.model) - elif isinstance(frame, STTLanguageUpdateFrame): - await self.set_language(frame.language) + elif isinstance(frame, STTUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.language is not None: + await self.set_language(frame.language) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 8b8e187ea..1c4cd284e 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -5,47 +5,47 @@ # import base64 -import json -import io import copy -from typing import Any, Dict, List, Optional +import io +import json +import re +from asyncio import CancelledError from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from loguru import logger from PIL import Image -from asyncio import CancelledError -import re from pydantic import BaseModel, Field from pipecat.frames.frames import ( Frame, + FunctionCallInProgressFrame, + FunctionCallResultFrame, LLMEnablePromptCachingFrame, - LLMModelUpdateFrame, - TextFrame, - VisionImageRawFrame, - UserImageRequestFrame, - UserImageRawFrame, - LLMMessagesFrame, - LLMFullResponseStartFrame, LLMFullResponseEndFrame, - FunctionCallResultFrame, - FunctionCallInProgressFrame, + LLMFullResponseStartFrame, + LLMMessagesFrame, + LLMUpdateSettingsFrame, StartInterruptionFrame, + TextFrame, + UserImageRawFrame, + UserImageRequestFrame, + VisionImageRawFrame, ) from pipecat.metrics.metrics import LLMTokenUsage -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantContextAggregator, + LLMUserContextAggregator, +) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, ) -from pipecat.processors.aggregators.llm_response import ( - LLMUserContextAggregator, - LLMAssistantContextAggregator, -) - -from loguru import logger +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService try: - from anthropic import AsyncAnthropic, NOT_GIVEN, NotGiven + from anthropic import NOT_GIVEN, AsyncAnthropic, NotGiven except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -293,9 +293,20 @@ async def 
process_frame(self, frame: Frame, direction: FrameDirection): # UserImageRawFrames coming through the pipeline and add them # to the context. context = AnthropicLLMContext.from_image_frame(frame) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) elif isinstance(frame, LLMEnablePromptCachingFrame): logger.debug(f"Setting enable prompt caching to: [{frame.enable}]") self._enable_prompt_caching_beta = frame.enable diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 38af3e41f..53efd8c17 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -17,7 +17,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMModelUpdateFrame, + LLMUpdateSettingsFrame, TextFrame, TTSAudioRawFrame, TTSStartedFrame, @@ -136,9 +136,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self._create_client(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 47bee5ec1..a830b65a8 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -4,38 +4,39 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import base64 import io import json -import httpx - from dataclasses import dataclass - from typing import Any, AsyncGenerator, Dict, List, Literal, Optional + +import aiohttp +import httpx +from loguru import logger +from PIL import Image from pydantic import BaseModel, Field from pipecat.frames.frames import ( ErrorFrame, Frame, + FunctionCallInProgressFrame, + FunctionCallResultFrame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMModelUpdateFrame, + LLMUpdateSettingsFrame, + StartInterruptionFrame, + TextFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - TextFrame, URLImageRawFrame, VisionImageRawFrame, - FunctionCallResultFrame, - FunctionCallInProgressFrame, - StartInterruptionFrame, ) from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.aggregators.llm_response import ( - LLMUserContextAggregator, LLMAssistantContextAggregator, + LLMUserContextAggregator, ) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, @@ -44,12 +45,14 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import ImageGenService, LLMService, TTSService -from PIL import Image - -from loguru import logger - try: - from openai import AsyncOpenAI, 
AsyncStream, DefaultAsyncHttpxClient, BadRequestError, NOT_GIVEN + from openai import ( + NOT_GIVEN, + AsyncOpenAI, + AsyncStream, + BadRequestError, + DefaultAsyncHttpxClient, + ) from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -280,9 +283,22 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.seed is not None: + await self.set_seed(frame.seed) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) else: await self.push_frame(frame, direction) @@ -464,7 +480,7 @@ async def process_frame(self, frame, direction): await self._push_aggregation() else: logger.warning( - f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + "FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" ) self._function_call_in_progress = None self._function_call_result = None diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index b1365bc69..981aa6de2 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -7,37 +7,36 @@ import json import re import uuid -from pydantic import BaseModel, Field - -from typing import Any, Dict, List, Optional -from dataclasses import dataclass from asyncio import CancelledError +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from loguru import logger +from pydantic import BaseModel, Field from pipecat.frames.frames import ( Frame, - LLMModelUpdateFrame, - TextFrame, - UserImageRequestFrame, - LLMMessagesFrame, - LLMFullResponseStartFrame, - LLMFullResponseEndFrame, - FunctionCallResultFrame, FunctionCallInProgressFrame, + FunctionCallResultFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMMessagesFrame, + LLMUpdateSettingsFrame, StartInterruptionFrame, + TextFrame, + UserImageRequestFrame, ) from pipecat.metrics.metrics import LLMTokenUsage -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantContextAggregator, + LLMUserContextAggregator, +) from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, ) -from pipecat.processors.aggregators.llm_response import ( - LLMUserContextAggregator, - LLMAssistantContextAggregator, -) - -from loguru import logger +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import LLMService try: from together import AsyncTogether @@ -188,7 +187,7 @@ async def _process_context(self, context: 
OpenAILLMContext): if chunk.choices[0].finish_reason == "eos" and accumulating_function_call: await self._extract_function_call(context, function_call_accumulator) - except CancelledError as e: + except CancelledError: # todo: implement token counting estimates for use when the user interrupts a long generation # we do this in the anthropic.py service raise @@ -206,9 +205,24 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = frame.context elif isinstance(frame, LLMMessagesFrame): context = TogetherLLMContext.from_messages(frame.messages) - elif isinstance(frame, LLMModelUpdateFrame): - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) else: await self.push_frame(frame, direction) @@ -338,7 +352,7 @@ async def process_frame(self, frame, direction): await self._push_aggregation() else: logger.warning( - f"FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + "FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" ) self._function_call_in_progress = None self._function_call_result = None From 7fe118ce639aeac6bd84c7c2e721f3cf6dd298e0 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 11:22:03 -0400 Subject: [PATCH 41/60] Align use of language param across TTS services --- src/pipecat/services/azure.py | 17 ++++++++++------- src/pipecat/services/elevenlabs.py | 10 +++++----- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index c8fa095ab..a1349cefe 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -41,7 +41,10 @@ SpeechRecognizer, SpeechSynthesizer, ) - from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream + from azure.cognitiveservices.speech.audio import ( + AudioStreamFormat, + PushAudioInputStream, + ) from azure.cognitiveservices.speech.dialog import AudioConfig from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: @@ -73,7 +76,7 @@ def create_client(self, api_key=None, base_url=None, **kwargs): class AzureTTSService(TTSService): class InputParams(BaseModel): emphasis: Optional[str] = None - language_code: Optional[str] = "en-US" + language: Optional[str] = "en-US" pitch: Optional[str] = None rate: Optional[str] = "1.05" role: Optional[str] = None @@ -105,7 +108,7 @@ def can_generate_metrics(self) -> bool: def _construct_ssml(self, text: str) -> str: ssml = ( - f"" f"" @@ -155,9 +158,9 @@ async def set_emphasis(self, emphasis: str): logger.debug(f"Setting TTS emphasis to: [{emphasis}]") self._params.emphasis = emphasis - async def set_language_code(self, language_code: str): - logger.debug(f"Setting TTS language code to: [{language_code}]") - 
self._params.language_code = language_code + async def set_language(self, language: str): + logger.debug(f"Setting TTS language code to: [{language}]") + self._params.language = language async def set_pitch(self, pitch: str): logger.debug(f"Setting TTS pitch to: [{pitch}]") @@ -187,7 +190,7 @@ async def set_params(self, **kwargs): valid_params = { "voice": self.set_voice, "emphasis": self.set_emphasis, - "language_code": self.set_language_code, + "language_code": self.set_language, "pitch": self.set_pitch, "rate": self.set_rate, "role": self.set_role, diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 79d90bc58..ca4713f5f 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -72,7 +72,7 @@ def calculate_word_times( class ElevenLabsTTSService(AsyncWordTTSService): class InputParams(BaseModel): - language_code: Optional[str] = None + language: Optional[str] = None output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" optimize_streaming_latency: Optional[str] = None stability: Optional[float] = None @@ -229,13 +229,13 @@ async def _connect(self): if self._params.optimize_streaming_latency: url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}" - # language_code can only be used with the 'eleven_turbo_v2_5' model - if self._params.language_code: + # language can only be used with the 'eleven_turbo_v2_5' model + if self._params.language: if model == "eleven_turbo_v2_5": - url += f"&language_code={self._params.language_code}" + url += f"&language_code={self._params.language}" else: logger.debug( - f"Language code [{self._params.language_code}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model." + f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model." 
) self._websocket = await websockets.connect(url) From d7555609fd752e27ea34879356613d50d4055e8b Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 27 Sep 2024 11:57:50 -0400 Subject: [PATCH 42/60] Add TTS update settings options --- src/pipecat/frames/frames.py | 12 +++++- src/pipecat/services/ai_services.py | 62 ++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 1b31b9c88..8059b904b 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -5,7 +5,7 @@ # from dataclasses import dataclass, field -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union from pipecat.clocks.base_clock import BaseClock from pipecat.metrics.metrics import MetricsData @@ -548,6 +548,16 @@ class TTSUpdateSettingsFrame(ControlFrame): model: Optional[str] = None voice: Optional[str] = None language: Optional[Language] = None + speed: Optional[Union[str, float]] = None + emotion: Optional[List[str]] = None + engine: Optional[str] = None + pitch: Optional[str] = None + rate: Optional[str] = None + volume: Optional[str] = None + emphasis: Optional[str] = None + style: Optional[str] = None + style_degree: Optional[str] = None + role: Optional[str] = None @dataclass diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 79e52531d..1cb91d6a2 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -8,7 +8,7 @@ import io import wave from abc import abstractmethod -from typing import AsyncGenerator, List, Optional, Tuple +from typing import AsyncGenerator, List, Optional, Tuple, Union from loguru import logger @@ -170,6 +170,46 @@ async def set_voice(self, voice: str): async def set_language(self, language: Language): pass + @abstractmethod + async def set_speed(self, speed: Union[str, float]): + pass + + @abstractmethod + async def set_emotion(self, emotion: List[str]): + pass + + @abstractmethod + async def set_engine(self, engine: str): + pass + + @abstractmethod + async def set_pitch(self, pitch: str): + pass + + @abstractmethod + async def set_rate(self, rate: str): + pass + + @abstractmethod + async def set_volume(self, volume: str): + pass + + @abstractmethod + async def set_emphasis(self, emphasis: str): + pass + + @abstractmethod + async def set_style(self, style: str): + pass + + @abstractmethod + async def set_style_degree(self, style_degree: str): + pass + + @abstractmethod + async def set_role(self, role: str): + pass + # Converts the text to audio. 
@abstractmethod async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: @@ -233,6 +273,26 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.set_voice(frame.voice) if frame.language is not None: await self.set_language(frame.language) + if frame.speed is not None: + await self.set_speed(frame.speed) + if frame.emotion is not None: + await self.set_emotion(frame.emotion) + if frame.engine is not None: + await self.set_engine(frame.engine) + if frame.pitch is not None: + await self.set_pitch(frame.pitch) + if frame.rate is not None: + await self.set_rate(frame.rate) + if frame.volume is not None: + await self.set_volume(frame.volume) + if frame.emphasis is not None: + await self.set_emphasis(frame.emphasis) + if frame.style is not None: + await self.set_style(frame.style) + if frame.style_degree is not None: + await self.set_style_degree(frame.style_degree) + if frame.role is not None: + await self.set_role(frame.role) else: await self.push_frame(frame, direction) From 1f77863aef18c1f81d181d05e03924cbdf855e2a Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 12:45:32 -0400 Subject: [PATCH 43/60] Code review feedback --- src/pipecat/services/ai_services.py | 66 ++++++++++++++++------------- src/pipecat/services/anthropic.py | 29 +++++++------ src/pipecat/services/openai.py | 33 ++++++++------- src/pipecat/services/together.py | 37 ++++++++-------- 4 files changed, 90 insertions(+), 75 deletions(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 1cb91d6a2..ba78b24f8 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -248,6 +248,34 @@ async def _push_tts_frames(self, text: str): # interrupted, the text is not added to the assistant context. 
await self.push_frame(TextFrame(text)) + async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.voice is not None: + await self.set_voice(frame.voice) + if frame.language is not None: + await self.set_language(frame.language) + if frame.speed is not None: + await self.set_speed(frame.speed) + if frame.emotion is not None: + await self.set_emotion(frame.emotion) + if frame.engine is not None: + await self.set_engine(frame.engine) + if frame.pitch is not None: + await self.set_pitch(frame.pitch) + if frame.rate is not None: + await self.set_rate(frame.rate) + if frame.volume is not None: + await self.set_volume(frame.volume) + if frame.emphasis is not None: + await self.set_emphasis(frame.emphasis) + if frame.style is not None: + await self.set_style(frame.style) + if frame.style_degree is not None: + await self.set_style_degree(frame.style_degree) + if frame.role is not None: + await self.set_role(frame.role) + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -267,32 +295,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, TTSSpeakFrame): await self._push_tts_frames(frame.text) elif isinstance(frame, TTSUpdateSettingsFrame): - if frame.model is not None: - await self.set_model(frame.model) - if frame.voice is not None: - await self.set_voice(frame.voice) - if frame.language is not None: - await self.set_language(frame.language) - if frame.speed is not None: - await self.set_speed(frame.speed) - if frame.emotion is not None: - await self.set_emotion(frame.emotion) - if frame.engine is not None: - await self.set_engine(frame.engine) - if frame.pitch is not None: - await self.set_pitch(frame.pitch) - if frame.rate is not None: - await self.set_rate(frame.rate) - if frame.volume is not None: - await self.set_volume(frame.volume) - if frame.emphasis is not None: - await self.set_emphasis(frame.emphasis) - if frame.style is not None: - await self.set_style(frame.style) - if frame.style_degree is not None: - await self.set_style_degree(frame.style_degree) - if frame.role is not None: - await self.set_role(frame.role) + await self._update_tts_settings(frame) else: await self.push_frame(frame, direction) @@ -454,6 +457,12 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: """Returns transcript as a string""" pass + async def _update_stt_settings(self, frame: STTUpdateSettingsFrame): + if frame.model is not None: + await self.set_model(frame.model) + if frame.language is not None: + await self.set_language(frame.language) + async def process_audio_frame(self, frame: AudioRawFrame): await self.process_generator(self.run_stt(frame.audio)) @@ -466,10 +475,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # push a TextFrame. We don't really want to push audio frames down. 
await self.process_audio_frame(frame) elif isinstance(frame, STTUpdateSettingsFrame): - if frame.model is not None: - await self.set_model(frame.model) - if frame.language is not None: - await self.set_language(frame.language) + await self._update_stt_settings(frame) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 1c4cd284e..bc91e4e16 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -279,6 +279,21 @@ async def _process_context(self, context: OpenAILLMContext): cache_read_input_tokens=cache_read_input_tokens, ) + async def _update_settings(self, frame: LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -294,19 +309,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # to the context. context = AnthropicLLMContext.from_image_frame(frame) elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.max_tokens is not None: - await self.set_max_tokens(frame.max_tokens) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_k is not None: - await self.set_top_k(frame.top_k) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + await self._update_settings(frame) elif isinstance(frame, LLMEnablePromptCachingFrame): logger.debug(f"Setting enable prompt caching to: [{frame.enable}]") self._enable_prompt_caching_beta = frame.enable diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index a830b65a8..f0892b9ca 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -273,6 +273,23 @@ async def _handle_function_call(self, context, tool_call_id, function_name, argu arguments=arguments, ) + async def _update_settings(self, frame: LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.seed is not None: + await self.set_seed(frame.seed) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -284,21 +301,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not 
None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.frequency_penalty is not None: - await self.set_frequency_penalty(frame.frequency_penalty) - if frame.presence_penalty is not None: - await self.set_presence_penalty(frame.presence_penalty) - if frame.seed is not None: - await self.set_seed(frame.seed) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + await self._update_settings(frame) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 981aa6de2..e4068ecfc 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -128,6 +128,25 @@ async def set_extra(self, extra: Dict[str, Any]): logger.debug(f"Switching LLM extra to: [{extra}]") self._extra = extra + async def _update_settings(self, frame: LLMUpdateSettingsFrame): + if frame.model is not None: + logger.debug(f"Switching LLM model to: [{frame.model}]") + self.set_model_name(frame.model) + if frame.frequency_penalty is not None: + await self.set_frequency_penalty(frame.frequency_penalty) + if frame.max_tokens is not None: + await self.set_max_tokens(frame.max_tokens) + if frame.presence_penalty is not None: + await self.set_presence_penalty(frame.presence_penalty) + if frame.temperature is not None: + await self.set_temperature(frame.temperature) + if frame.top_k is not None: + await self.set_top_k(frame.top_k) + if frame.top_p is not None: + await self.set_top_p(frame.top_p) + if frame.extra: + await self.set_extra(frame.extra) + async def _process_context(self, context: OpenAILLMContext): try: await self.push_frame(LLMFullResponseStartFrame()) @@ -206,23 +225,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, LLMMessagesFrame): context = TogetherLLMContext.from_messages(frame.messages) elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.frequency_penalty is not None: - await self.set_frequency_penalty(frame.frequency_penalty) - if frame.max_tokens is not None: - await self.set_max_tokens(frame.max_tokens) - if frame.presence_penalty is not None: - await self.set_presence_penalty(frame.presence_penalty) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_k is not None: - await self.set_top_k(frame.top_k) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + await self._update_settings(frame) else: await self.push_frame(frame, direction) From ed49cebf2c32ec522f2dc9d592ca63e103382ef9 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 15:16:46 -0400 Subject: [PATCH 44/60] Set Google TTS default language to en-US --- src/pipecat/services/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 53efd8c17..519f47028 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -153,7 +153,7 @@ class InputParams(BaseModel): rate: Optional[str] = None volume: Optional[str] = None emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None - language: Optional[str] = None + language: Optional[str] 
= "en-US" gender: Optional[Literal["male", "female", "neutral"]] = None google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None From 6b7f924af606bffd006a7e239af5be281ae9f83d Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 30 Sep 2024 14:33:08 -0700 Subject: [PATCH 45/60] tts sentence aggregation fix --- .../07a-interruptible-anthropic.py | 27 ++++++++----------- src/pipecat/services/ai_services.py | 12 +++++---- src/pipecat/utils/string.py | 6 ++--- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/examples/foundational/07a-interruptible-anthropic.py b/examples/foundational/07a-interruptible-anthropic.py index 2bded2480..288cb1b31 100644 --- a/examples/foundational/07a-interruptible-anthropic.py +++ b/examples/foundational/07a-interruptible-anthropic.py @@ -5,29 +5,24 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, - LLMUserResponseAggregator, -) -from pipecat.services.cartesia import CartesiaTTSService +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.anthropic import AnthropicLLMService +from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -69,17 +64,17 @@ async def main(): }, ] - tma_in = LLMUserResponseAggregator(messages) - tma_out = LLMAssistantResponseAggregator(messages) + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) pipeline = Pipeline( [ transport.input(), # Transport user input - tma_in, # User responses + context_aggregator.user(), # User responses llm, # LLM tts, # TTS transport.output(), # Transport bot output - tma_out, # Assistant spoken responses + context_aggregator.assistant(), # Assistant spoken responses ] ) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index ba78b24f8..8386fccd5 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -228,16 +228,18 @@ async def _process_text_frame(self, frame: TextFrame): text = frame.text else: self._current_sentence += frame.text - if match_endofsentence(self._current_sentence): - text = self._current_sentence - self._current_sentence = "" + eos_end_marker = match_endofsentence(self._current_sentence) + if eos_end_marker: + text = self._current_sentence[:eos_end_marker] + self._current_sentence = self._current_sentence[eos_end_marker:] if text: await self._push_tts_frames(text) async def _push_tts_frames(self, text: str): - text = text.strip() - if not text: + # Don't send only whitespace. This causes problems for some TTS models. But also don't + # strip all whitespace, as whitespace can influence prosody. 
+ if not text.strip(): return await self.start_processing_metrics() diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py index cf9a22ad8..936764345 100644 --- a/src/pipecat/utils/string.py +++ b/src/pipecat/utils/string.py @@ -6,7 +6,6 @@ import re - ENDOFSENTENCE_PATTERN_STR = r""" (? bool: - return ENDOFSENTENCE_PATTERN.search(text.rstrip()) is not None +def match_endofsentence(text: str) -> int: + match = ENDOFSENTENCE_PATTERN.search(text.rstrip()) + return match.end() if match else 0 From c8995b82e56d3d293057e12a7a46143e4becf71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 10:05:56 -0700 Subject: [PATCH 46/60] all frame processors are asynchrnous In this commit we make all frame processors asynchronous, that is, they have an internal queue and they push frames using a task from that queue. --- CHANGELOG.md | 22 +-- .../foundational/05-sync-speech-and-image.py | 10 +- .../05a-local-sync-speech-and-image.py | 8 +- src/pipecat/processors/frame_processor.py | 22 +-- src/pipecat/processors/frameworks/rtvi.py | 2 +- .../processors/gstreamer/pipeline_source.py | 2 +- .../processors/idle_frame_processor.py | 2 +- src/pipecat/processors/user_idle_processor.py | 2 +- src/pipecat/services/ai_services.py | 155 ++++++++---------- src/pipecat/services/cartesia.py | 4 +- src/pipecat/services/elevenlabs.py | 4 +- src/pipecat/services/gladia.py | 2 +- src/pipecat/services/lmnt.py | 6 +- src/pipecat/transports/base_input.py | 2 +- src/pipecat/transports/base_output.py | 2 +- 15 files changed, 113 insertions(+), 132 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f489556c..b59ed56c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,15 +48,10 @@ async def on_connected(processor): frames. To achieve that, each frame processor should only output frames from a single task. - In this version we introduce synchronous and asynchronous frame - processors. The synchronous processors push output frames from the same task - that they receive input frames, and therefore only pushing frames from one - task. Asynchronous frame processors can have internal tasks to perform things - asynchronously (e.g. receiving data from a websocket) but they also have a - single task where they push frames from. - - By default, frame processors are synchronous. To change a frame processor to - asynchronous you only need to pass `sync=False` to the base class constructor. + In this version all the frame processors have their own task to push + frames. That is, when `push_frame()` is called the given frame will be put + into an internal queue (with the exception of system frames) and a frame + processor task will push it out. - Added pipeline clocks. A pipeline clock is used by the output transport to know when a frame needs to be presented. For that, all frames now have an @@ -68,9 +63,7 @@ async def on_connected(processor): `SystemClock`). This clock will be passed to each frame processor via the `StartFrame`. -- Added `CartesiaHttpTTSService`. This is a synchronous frame processor - (i.e. given an input text frame it will wait for the whole output before - returning). +- Added `CartesiaHttpTTSService`. - `DailyTransport` now supports setting the audio bitrate to improve audio quality through the `DailyParams.audio_out_bitrate` parameter. The new @@ -110,8 +103,9 @@ async def on_connected(processor): pipelines to be executed concurrently. 
The difference between a `SyncParallelPipeline` and a `ParallelPipeline` is that, given an input frame, the `SyncParallelPipeline` will wait for all the internal pipelines to - complete. This is achieved by ensuring all the processors in each of the - internal pipelines are synchronous. + complete. This is achieved by making sure the last processor in each of the + pipelines is synchronous (e.g. an HTTP-based service that waits for the + response). - `StartFrame` is back a system frame so we make sure it's processed immediately by all processors. `EndFrame` stays a control frame since it needs to be diff --git a/examples/foundational/05-sync-speech-and-image.py b/examples/foundational/05-sync-speech-and-image.py index dae860a92..5477d0691 100644 --- a/examples/foundational/05-sync-speech-and-image.py +++ b/examples/foundational/05-sync-speech-and-image.py @@ -86,13 +86,13 @@ async def main(): ), ) + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") - imagegen = FalImageGenService( params=FalImageGenService.InputParams(image_size="square_hd"), aiohttp_session=session, @@ -107,8 +107,10 @@ async def main(): # that, each pipeline runs concurrently and `SyncParallelPipeline` will # wait for the input frame to be processed. # - # Note that `SyncParallelPipeline` requires all processors in it to be - # synchronous (which is the default for most processors). + # Note that `SyncParallelPipeline` requires the last processor in each + # of the pipelines to be synchronous. In this case, we use + # `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP + # requests and wait for the response. pipeline = Pipeline( [ llm, # LLM diff --git a/examples/foundational/05a-local-sync-speech-and-image.py b/examples/foundational/05a-local-sync-speech-and-image.py index 27c36f6ce..4a561c073 100644 --- a/examples/foundational/05a-local-sync-speech-and-image.py +++ b/examples/foundational/05a-local-sync-speech-and-image.py @@ -82,6 +82,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self.frame = OutputAudioRawFrame( bytes(self.audio), frame.sample_rate, frame.num_channels ) + await self.push_frame(frame, direction) class ImageGrabber(FrameProcessor): def __init__(self): @@ -93,6 +94,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, URLImageRawFrame): self.frame = frame + await self.push_frame(frame, direction) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") @@ -121,8 +123,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # `SyncParallelPipeline` will wait for the input frame to be # processed. # - # Note that `SyncParallelPipeline` requires all processors in it to - # be synchronous (which is the default for most processors). + # Note that `SyncParallelPipeline` requires the last processor in + # each of the pipelines to be synchronous. In this case, we use + # `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP + # requests and wait for the response. 
pipeline = Pipeline( [ llm, # LLM diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index f71e066d7..f458f43ff 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -37,7 +37,6 @@ def __init__( *, name: str | None = None, metrics: FrameProcessorMetrics | None = None, - sync: bool = True, loop: asyncio.AbstractEventLoop | None = None, **kwargs, ): @@ -47,7 +46,6 @@ def __init__( self._prev: "FrameProcessor" | None = None self._next: "FrameProcessor" | None = None self._loop: asyncio.AbstractEventLoop = loop or asyncio.get_running_loop() - self._sync = sync self._event_handlers: dict = {} @@ -66,11 +64,8 @@ def __init__( # Every processor in Pipecat should only output frames from a single # task. This avoid problems like audio overlapping. System frames are - # the exception to this rule. - # - # This create this task. - if not self._sync: - self.__create_push_task() + # the exception to this rule. This create this task. + self.__create_push_task() @property def interruptions_allowed(self): @@ -167,7 +162,7 @@ async def push_error(self, error: ErrorFrame): await self.push_frame(error, FrameDirection.UPSTREAM) async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): - if self._sync or isinstance(frame, SystemFrame): + if isinstance(frame, SystemFrame): await self.__internal_push_frame(frame, direction) else: await self.__push_queue.put((frame, direction)) @@ -194,13 +189,12 @@ def _register_event_handler(self, event_name: str): # async def _start_interruption(self): - if not self._sync: - # Cancel the task. This will stop pushing frames downstream. - self.__push_frame_task.cancel() - await self.__push_frame_task + # Cancel the task. This will stop pushing frames downstream. + self.__push_frame_task.cancel() + await self.__push_frame_task - # Create a new queue and task. - self.__create_push_task() + # Create a new queue and task. + self.__create_push_task() async def _stop_interruption(self): # Nothing to do right now. 
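Taken together with the changelog text above, the frame_processor.py change boils down to a small asyncio pattern: every processor owns an internal queue plus a single task that drains it, and only system frames bypass the queue so they are handled immediately. Below is a minimal, self-contained sketch of that pattern; ToyProcessor and the toy Frame classes are illustrative stand-ins, not Pipecat's real API.

    import asyncio
    from dataclasses import dataclass


    @dataclass
    class Frame:
        name: str


    @dataclass
    class SystemFrame(Frame):
        pass


    class ToyProcessor:
        # Stand-in for a frame processor: it owns one queue and one task that
        # pushes queued frames downstream, so all output comes from a single task.
        def __init__(self):
            self._queue: asyncio.Queue = asyncio.Queue()
            self._task = asyncio.create_task(self._push_task_handler())

        async def push_frame(self, frame: Frame):
            if isinstance(frame, SystemFrame):
                # System frames bypass the queue and are delivered immediately.
                await self._deliver(frame)
            else:
                await self._queue.put(frame)

        async def _push_task_handler(self):
            # The single task that pushes all non-system frames, in order.
            while True:
                frame = await self._queue.get()
                await self._deliver(frame)
                self._queue.task_done()

        async def _deliver(self, frame: Frame):
            print(f"delivered {frame.name}")


    async def main():
        processor = ToyProcessor()
        await processor.push_frame(Frame("audio-1"))
        await processor.push_frame(SystemFrame("interruption"))  # not queued
        await processor.push_frame(Frame("audio-2"))
        await processor._queue.join()  # wait for the queued frames to go out
        processor._task.cancel()


    # Prints "delivered interruption" first, then "audio-1" and "audio-2".
    asyncio.run(main())

In the real FrameProcessor the queue and task come from __create_push_task(), and _start_interruption() cancels the task and creates a fresh queue, which discards anything still waiting in the old one.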
diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index f88660f60..7a6054c3c 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -516,7 +516,7 @@ def __init__( params: RTVIProcessorParams = RTVIProcessorParams(), **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._config = config self._params = params diff --git a/src/pipecat/processors/gstreamer/pipeline_source.py b/src/pipecat/processors/gstreamer/pipeline_source.py index 9f8471153..426eab50a 100644 --- a/src/pipecat/processors/gstreamer/pipeline_source.py +++ b/src/pipecat/processors/gstreamer/pipeline_source.py @@ -44,7 +44,7 @@ class OutputParams(BaseModel): clock_sync: bool = True def __init__(self, *, pipeline: str, out_params: OutputParams = OutputParams(), **kwargs): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._out_params = out_params diff --git a/src/pipecat/processors/idle_frame_processor.py b/src/pipecat/processors/idle_frame_processor.py index 576cb9087..e674b6b84 100644 --- a/src/pipecat/processors/idle_frame_processor.py +++ b/src/pipecat/processors/idle_frame_processor.py @@ -26,7 +26,7 @@ def __init__( types: List[type] = [], **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._callback = callback self._timeout = timeout diff --git a/src/pipecat/processors/user_idle_processor.py b/src/pipecat/processors/user_idle_processor.py index 31d49cf5a..507dcb495 100644 --- a/src/pipecat/processors/user_idle_processor.py +++ b/src/pipecat/processors/user_idle_processor.py @@ -31,7 +31,7 @@ def __init__( timeout: float, **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._callback = callback self._timeout = timeout diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index ba78b24f8..d27a3277d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -144,6 +144,10 @@ def __init__( # if True, TTSService will push TextFrames and LLMFullResponseEndFrames, # otherwise subclass must do it push_text_frames: bool = True, + # if True, TTSService will push TTSStoppedFrames, otherwise subclass must do it + push_stop_frames: bool = False, + # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame + stop_frame_timeout_s: float = 1.0, # TTS output sample rate sample_rate: int = 16000, **kwargs, @@ -151,9 +155,15 @@ def __init__( super().__init__(**kwargs) self._aggregate_sentences: bool = aggregate_sentences self._push_text_frames: bool = push_text_frames - self._current_sentence: str = "" + self._push_stop_frames: bool = push_stop_frames + self._stop_frame_timeout_s: float = stop_frame_timeout_s self._sample_rate: int = sample_rate + self._stop_frame_task: Optional[asyncio.Task] = None + self._stop_frame_queue: asyncio.Queue = asyncio.Queue() + + self._current_sentence: str = "" + @property def sample_rate(self) -> int: return self._sample_rate @@ -210,13 +220,72 @@ async def set_style_degree(self, style_degree: str): async def set_role(self, role: str): pass + @abstractmethod + async def flush_audio(self): + pass + # Converts the text to audio. 
@abstractmethod async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: pass + async def start(self, frame: StartFrame): + await super().start(frame) + if self._push_stop_frames: + self._stop_frame_task = self.get_event_loop().create_task(self._stop_frame_handler()) + + async def stop(self, frame: EndFrame): + await super().stop(frame) + if self._stop_frame_task: + self._stop_frame_task.cancel() + await self._stop_frame_task + self._stop_frame_task = None + + async def cancel(self, frame: CancelFrame): + await super().cancel(frame) + if self._stop_frame_task: + self._stop_frame_task.cancel() + await self._stop_frame_task + self._stop_frame_task = None + async def say(self, text: str): await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM) + await self.flush_audio() + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, TextFrame): + await self._process_text_frame(frame) + elif isinstance(frame, StartInterruptionFrame): + await self._handle_interruption(frame, direction) + elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame): + sentence = self._current_sentence + self._current_sentence = "" + await self._push_tts_frames(sentence) + if isinstance(frame, LLMFullResponseEndFrame): + if self._push_text_frames: + await self.push_frame(frame, direction) + else: + await self.push_frame(frame, direction) + elif isinstance(frame, TTSSpeakFrame): + await self._push_tts_frames(frame.text) + await self.flush_audio() + elif isinstance(frame, TTSUpdateSettingsFrame): + await self._update_tts_settings(frame) + else: + await self.push_frame(frame, direction) + + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): + await super().push_frame(frame, direction) + + if self._push_stop_frames and ( + isinstance(frame, StartInterruptionFrame) + or isinstance(frame, TTSStartedFrame) + or isinstance(frame, TTSAudioRawFrame) + or isinstance(frame, TTSStoppedFrame) + ): + await self._stop_frame_queue.put(frame) async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): self._current_sentence = "" @@ -276,88 +345,6 @@ async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame): if frame.role is not None: await self.set_role(frame.role) - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - - if isinstance(frame, TextFrame): - await self._process_text_frame(frame) - elif isinstance(frame, StartInterruptionFrame): - await self._handle_interruption(frame, direction) - elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame): - sentence = self._current_sentence - self._current_sentence = "" - await self._push_tts_frames(sentence) - if isinstance(frame, LLMFullResponseEndFrame): - if self._push_text_frames: - await self.push_frame(frame, direction) - else: - await self.push_frame(frame, direction) - elif isinstance(frame, TTSSpeakFrame): - await self._push_tts_frames(frame.text) - elif isinstance(frame, TTSUpdateSettingsFrame): - await self._update_tts_settings(frame) - else: - await self.push_frame(frame, direction) - - -class AsyncTTSService(TTSService): - def __init__( - self, - # if True, TTSService will push TTSStoppedFrames, otherwise subclass must do it - push_stop_frames: bool = False, - # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame - 
stop_frame_timeout_s: float = 1.0, - **kwargs, - ): - super().__init__(sync=False, **kwargs) - self._push_stop_frames: bool = push_stop_frames - self._stop_frame_timeout_s: float = stop_frame_timeout_s - self._stop_frame_task: Optional[asyncio.Task] = None - self._stop_frame_queue: asyncio.Queue = asyncio.Queue() - - @abstractmethod - async def flush_audio(self): - pass - - async def say(self, text: str): - await super().say(text) - await self.flush_audio() - - async def start(self, frame: StartFrame): - await super().start(frame) - if self._push_stop_frames: - self._stop_frame_task = self.get_event_loop().create_task(self._stop_frame_handler()) - - async def stop(self, frame: EndFrame): - await super().stop(frame) - if self._stop_frame_task: - self._stop_frame_task.cancel() - await self._stop_frame_task - self._stop_frame_task = None - - async def cancel(self, frame: CancelFrame): - await super().cancel(frame) - if self._stop_frame_task: - self._stop_frame_task.cancel() - await self._stop_frame_task - self._stop_frame_task = None - - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - if isinstance(frame, TTSSpeakFrame): - await self.flush_audio() - - async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): - await super().push_frame(frame, direction) - - if self._push_stop_frames and ( - isinstance(frame, StartInterruptionFrame) - or isinstance(frame, TTSStartedFrame) - or isinstance(frame, TTSAudioRawFrame) - or isinstance(frame, TTSStoppedFrame) - ): - await self._stop_frame_queue.put(frame) - async def _stop_frame_handler(self): try: has_started = False @@ -378,7 +365,7 @@ async def _stop_frame_handler(self): pass -class AsyncWordTTSService(AsyncTTSService): +class WordTTSService(TTSService): def __init__(self, **kwargs): super().__init__(**kwargs) self._initial_word_timestamp = -1 diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index e38d56db3..5f798b1e5 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -26,7 +26,7 @@ ) from pipecat.processors.frame_processor import FrameDirection from pipecat.transcriptions.language import Language -from pipecat.services.ai_services import AsyncWordTTSService, TTSService +from pipecat.services.ai_services import WordTTSService, TTSService from loguru import logger @@ -61,7 +61,7 @@ def language_to_cartesia_language(language: Language) -> str | None: return None -class CartesiaTTSService(AsyncWordTTSService): +class CartesiaTTSService(WordTTSService): class InputParams(BaseModel): encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index ca4713f5f..611f2a024 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -23,7 +23,7 @@ TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import AsyncWordTTSService +from pipecat.services.ai_services import WordTTSService # See .env.example for ElevenLabs configuration needed try: @@ -70,7 +70,7 @@ def calculate_word_times( return word_times -class ElevenLabsTTSService(AsyncWordTTSService): +class ElevenLabsTTSService(WordTTSService): class InputParams(BaseModel): language: Optional[str] = None output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" diff --git a/src/pipecat/services/gladia.py 
b/src/pipecat/services/gladia.py index 12183adde..a590d73cf 100644 --- a/src/pipecat/services/gladia.py +++ b/src/pipecat/services/gladia.py @@ -51,7 +51,7 @@ def __init__( params: InputParams = InputParams(), **kwargs, ): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._api_key = api_key self._url = url diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py index 1ac24d731..8f18002c5 100644 --- a/src/pipecat/services/lmnt.py +++ b/src/pipecat/services/lmnt.py @@ -20,7 +20,7 @@ TTSStartedFrame, TTSStoppedFrame, ) -from pipecat.services.ai_services import AsyncTTSService +from pipecat.services.ai_services import TTSService from loguru import logger @@ -35,7 +35,7 @@ raise Exception(f"Missing module: {e}") -class LmntTTSService(AsyncTTSService): +class LmntTTSService(TTSService): def __init__( self, *, @@ -47,7 +47,7 @@ def __init__( ): # Let TTSService produce TTSStoppedFrames after a short delay of # no activity. - super().__init__(sync=False, push_stop_frames=True, sample_rate=sample_rate, **kwargs) + super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs) self._api_key = api_key self._voice_id = voice_id diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index df7babff1..710f8108a 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -31,7 +31,7 @@ class BaseInputTransport(FrameProcessor): def __init__(self, params: TransportParams, **kwargs): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._params = params diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 941a3505a..c3b9c792b 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -43,7 +43,7 @@ class BaseOutputTransport(FrameProcessor): def __init__(self, params: TransportParams, **kwargs): - super().__init__(sync=False, **kwargs) + super().__init__(**kwargs) self._params = params From a90ebdfe7c881d5fe4d3c6850bfc556f9d1a3c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 11:36:59 -0700 Subject: [PATCH 47/60] syncparallelpipeline: fix now that all frames are asynchronous --- .../pipeline/sync_parallel_pipeline.py | 66 ++++++++++++++++--- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/src/pipecat/pipeline/sync_parallel_pipeline.py b/src/pipecat/pipeline/sync_parallel_pipeline.py index 854cea89d..20f4275e4 100644 --- a/src/pipecat/pipeline/sync_parallel_pipeline.py +++ b/src/pipecat/pipeline/sync_parallel_pipeline.py @@ -6,17 +6,25 @@ import asyncio +from dataclasses import dataclass from itertools import chain from typing import List +from pipecat.frames.frames import ControlFrame, EndFrame, Frame, SystemFrame from pipecat.pipeline.base_pipeline import BasePipeline from pipecat.pipeline.pipeline import Pipeline from pipecat.processors.frame_processor import FrameDirection, FrameProcessor -from pipecat.frames.frames import Frame from loguru import logger +@dataclass +class SyncFrame(ControlFrame): + """This frame is used to know when the internal pipelines have finished.""" + + pass + + class Source(FrameProcessor): def __init__(self, upstream_queue: asyncio.Queue): super().__init__() @@ -67,13 +75,16 @@ def __init__(self, *args): raise TypeError(f"SyncParallelPipeline argument {processors} is not a list") # We add a source at the beginning of the pipeline and a sink at the end. 
- source = Source(self._up_queue) - sink = Sink(self._down_queue) + up_queue = asyncio.Queue() + down_queue = asyncio.Queue() + source = Source(up_queue) + sink = Sink(down_queue) processors: List[FrameProcessor] = [source] + processors + [sink] - # Keep track of sources and sinks. - self._sources.append(source) - self._sinks.append(sink) + # Keep track of sources and sinks. We also keep the output queue of + # the source and the sinks so we can use it later. + self._sources.append({"processor": source, "queue": down_queue}) + self._sinks.append({"processor": sink, "queue": up_queue}) # Create pipeline pipeline = Pipeline(processors) @@ -94,17 +105,52 @@ def processors_with_metrics(self) -> List[FrameProcessor]: async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) + # The last processor of each pipeline needs to be synchronous otherwise + # this element won't work. Since, we know it should be synchronous we + # push a SyncFrame. Since frames are ordered we know this frame will be + # pushed after the synchronous processor has pushed its data allowing us + # to synchrnonize all the internal pipelines by waiting for the + # SyncFrame in all of them. + async def wait_for_sync( + obj, main_queue: asyncio.Queue, frame: Frame, direction: FrameDirection + ): + processor = obj["processor"] + queue = obj["queue"] + + await processor.process_frame(frame, direction) + + if isinstance(frame, (SystemFrame, EndFrame)): + new_frame = await queue.get() + if isinstance(new_frame, (SystemFrame, EndFrame)): + await main_queue.put(new_frame) + else: + while not isinstance(new_frame, (SystemFrame, EndFrame)): + await main_queue.put(new_frame) + queue.task_done() + new_frame = await queue.get() + else: + await processor.process_frame(SyncFrame(), direction) + new_frame = await queue.get() + while not isinstance(new_frame, SyncFrame): + await main_queue.put(new_frame) + queue.task_done() + new_frame = await queue.get() + if direction == FrameDirection.UPSTREAM: # If we get an upstream frame we process it in each sink. - await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks]) + await asyncio.gather( + *[wait_for_sync(s, self._up_queue, frame, direction) for s in self._sinks] + ) elif direction == FrameDirection.DOWNSTREAM: # If we get a downstream frame we process it in each source. 
- await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sources]) + await asyncio.gather( + *[wait_for_sync(s, self._down_queue, frame, direction) for s in self._sources] + ) seen_ids = set() while not self._up_queue.empty(): frame = await self._up_queue.get() - if frame and frame.id not in seen_ids: + if frame.id not in seen_ids: await self.push_frame(frame, FrameDirection.UPSTREAM) seen_ids.add(frame.id) self._up_queue.task_done() @@ -112,7 +158,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): seen_ids = set() while not self._down_queue.empty(): frame = await self._down_queue.get() - if frame and frame.id not in seen_ids: + if frame.id not in seen_ids: await self.push_frame(frame, FrameDirection.DOWNSTREAM) seen_ids.add(frame.id) self._down_queue.task_done() From d080a31a5ca322364f5645aafa3727d47ea32f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 11:37:26 -0700 Subject: [PATCH 48/60] tests: fix langchanin tests --- tests/test_langchain.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_langchain.py b/tests/test_langchain.py index fb222205b..d30d213bd 100644 --- a/tests/test_langchain.py +++ b/tests/test_langchain.py @@ -7,9 +7,9 @@ import unittest from pipecat.frames.frames import ( + EndFrame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, - StopTaskFrame, TextFrame, TranscriptionFrame, UserStartedSpeakingFrame, @@ -32,6 +32,7 @@ class TestLangchain(unittest.IsolatedAsyncioTestCase): class MockProcessor(FrameProcessor): def __init__(self, name): + super().__init__() self.name = name self.token: list[str] = [] # Start collecting tokens when we see the start frame @@ -55,13 +56,13 @@ async def process_frame(self, frame, direction): def setUp(self): self.expected_response = "Hello dear human" self.fake_llm = FakeStreamingListLLM(responses=[self.expected_response]) - self.mock_proc = self.MockProcessor("token_collector") async def test_langchain(self): messages = [("system", "Say hello to {name}"), ("human", "{input}")] prompt = ChatPromptTemplate.from_messages(messages).partial(name="Thomas") chain = prompt | self.fake_llm proc = LangchainProcessor(chain=chain) + self.mock_proc = self.MockProcessor("token_collector") tma_in = LLMUserResponseAggregator(messages) tma_out = LLMAssistantResponseAggregator(messages) @@ -81,7 +82,7 @@ async def test_langchain(self): UserStartedSpeakingFrame(), TranscriptionFrame(text="Hi World", user_id="user", timestamp="now"), UserStoppedSpeakingFrame(), - StopTaskFrame(), + EndFrame(), ] ) From 4d1e370e02b411142ad78bb80743fd2d58efccb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 30 Sep 2024 11:56:59 -0700 Subject: [PATCH 49/60] pipeline(task): since everything is async tasks should wait for EndFrame --- src/pipecat/pipeline/task.py | 43 ++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index f79ff6f39..96845430d 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -69,6 +69,19 @@ async def _handle_upstream_frame(self, frame: Frame): await self._up_queue.put(StopTaskFrame()) +class Sink(FrameProcessor): + def __init__(self, down_queue: asyncio.Queue): + super().__init__() + self._down_queue = down_queue + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + # We really just 
want to know when the EndFrame reached the sink. + if isinstance(frame, EndFrame): + await self._down_queue.put(frame) + + class PipelineTask: def __init__( self, @@ -84,12 +97,16 @@ def __init__( self._params = params self._finished = False - self._down_queue = asyncio.Queue() self._up_queue = asyncio.Queue() + self._down_queue = asyncio.Queue() + self._push_queue = asyncio.Queue() self._source = Source(self._up_queue) self._source.link(pipeline) + self._sink = Sink(self._down_queue) + pipeline.link(self._sink) + def has_finished(self): return self._finished @@ -103,19 +120,19 @@ async def cancel(self): # out-of-band from the main streaming task which is what we want since # we want to cancel right away. await self._source.push_frame(CancelFrame()) - self._process_down_task.cancel() + self._process_push_task.cancel() self._process_up_task.cancel() - await self._process_down_task + await self._process_push_task await self._process_up_task async def run(self): self._process_up_task = asyncio.create_task(self._process_up_queue()) - self._process_down_task = asyncio.create_task(self._process_down_queue()) - await asyncio.gather(self._process_up_task, self._process_down_task) + self._process_push_task = asyncio.create_task(self._process_push_queue()) + await asyncio.gather(self._process_up_task, self._process_push_task) self._finished = True async def queue_frame(self, frame: Frame): - await self._down_queue.put(frame) + await self._push_queue.put(frame) async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]): if isinstance(frames, AsyncIterable): @@ -133,7 +150,7 @@ def _initial_metrics_frame(self) -> MetricsFrame: data.append(ProcessingMetricsData(processor=p.name, value=0.0)) return MetricsFrame(data=data) - async def _process_down_queue(self): + async def _process_push_queue(self): self._clock.start() start_frame = StartFrame( @@ -154,11 +171,13 @@ async def _process_down_queue(self): should_cleanup = True while running: try: - frame = await self._down_queue.get() + frame = await self._push_queue.get() await self._source.process_frame(frame, FrameDirection.DOWNSTREAM) + if isinstance(frame, EndFrame): + await self._wait_for_endframe() running = not (isinstance(frame, StopTaskFrame) or isinstance(frame, EndFrame)) should_cleanup = not isinstance(frame, StopTaskFrame) - self._down_queue.task_done() + self._push_queue.task_done() except asyncio.CancelledError: break # Cleanup only if we need to. @@ -169,6 +188,12 @@ async def _process_down_queue(self): self._process_up_task.cancel() await self._process_up_task + async def _wait_for_endframe(self): + # NOTE(aleix): the Sink element just pushes EndFrames to the down queue, + # so just wait for it. In the future we might do something else here, + # but for now this is fine. 
+ await self._down_queue.get() + async def _process_up_queue(self): while True: try: From f8a75cede901c9f614e83c4d62ce24fc7a97a410 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 18:22:38 -0400 Subject: [PATCH 50/60] Update daily-python to 0.11.0 --- pyproject.toml | 2 +- test-requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e31755d50..a29697bdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ anthropic = [ "anthropic~=0.34.0" ] aws = [ "boto3~=1.35.27" ] azure = [ "azure-cognitiveservices-speech~=1.40.0" ] cartesia = [ "cartesia~=1.0.13", "websockets~=12.0" ] -daily = [ "daily-python~=0.10.1" ] +daily = [ "daily-python~=0.11.0" ] deepgram = [ "deepgram-sdk~=3.5.0" ] elevenlabs = [ "websockets~=12.0" ] examples = [ "python-dotenv~=1.0.1", "flask~=3.0.3", "flask_cors~=4.0.1" ] diff --git a/test-requirements.txt b/test-requirements.txt index 8c7db7377..07ef45054 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -2,7 +2,7 @@ aiohttp~=3.10.3 anthropic~=0.30.0 azure-cognitiveservices-speech~=1.40.0 boto3~=1.35.27 -daily-python~=0.10.1 +daily-python~=0.11.0 deepgram-sdk~=3.5.0 fal-client~=0.4.1 fastapi~=0.112.1 From 69c7edd60c8d586bf786f7f442e2217d13d9db42 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Sep 2024 21:37:03 -0700 Subject: [PATCH 51/60] pushing context frames from assistant aggregators --- src/pipecat/services/anthropic.py | 5 ++++- src/pipecat/services/openai.py | 5 ++++- src/pipecat/services/together.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index bc91e4e16..6a535ef15 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -579,7 +579,7 @@ async def _push_aggregation(self): run_llm = False aggregation = self._aggregation - self._aggregation = "" + self._reset() try: if self._function_call_result: @@ -630,5 +630,8 @@ async def _push_aggregation(self): if run_llm: await self._user_context_aggregator.push_context_frame() + frame = OpenAILLMContextFrame(self._context) + await self.push_frame(frame) + except Exception as e: logger.error(f"Error processing frame: {e}") diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index f0892b9ca..99d2d7497 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -495,7 +495,7 @@ async def _push_aggregation(self): run_llm = False aggregation = self._aggregation - self._aggregation = "" + self._reset() try: if self._function_call_result: @@ -531,5 +531,8 @@ async def _push_aggregation(self): if run_llm: await self._user_context_aggregator.push_context_frame() + frame = OpenAILLMContextFrame(self._context) + await self.push_frame(frame) + except Exception as e: logger.error(f"Error processing frame: {e}") diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index e4068ecfc..935f625ad 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -370,7 +370,7 @@ async def _push_aggregation(self): run_llm = False aggregation = self._aggregation - self._aggregation = "" + self._reset() try: if self._function_call_result: @@ -390,5 +390,8 @@ async def _push_aggregation(self): if run_llm: await self._user_context_aggregator.push_messages_frame() + frame = OpenAILLMContextFrame(self._context) + await self.push_frame(frame) + except Exception as e: logger.error(f"Error 
processing frame: {e}") From 37da7e44cdbd7ec3d580926320b4cc5edc9ae450 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Sep 2024 22:01:40 -0700 Subject: [PATCH 52/60] whitespace fix --- .../07l-interruptible-together.py | 27 +++++++++---------- .../processors/aggregators/llm_response.py | 15 +++++------ 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py index e2cb55fed..a7086c941 100644 --- a/examples/foundational/07l-interruptible-together.py +++ b/examples/foundational/07l-interruptible-together.py @@ -5,29 +5,24 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_response import ( - LLMAssistantResponseAggregator, - LLMUserResponseAggregator, -) +from pipecat.services.ai_services import OpenAILLMContext from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.together import TogetherLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -76,17 +71,19 @@ async def main(): }, ] - tma_in = LLMUserResponseAggregator(messages) - tma_out = LLMAssistantResponseAggregator(messages) + context = OpenAILLMContext(messages, tools) + context_aggregator = llm.create_context_aggregator(context) + user_aggregator = context_aggregator.user() + assistant_aggregator = context_aggregator.assistant() pipeline = Pipeline( [ transport.input(), # Transport user input - tma_in, # User responses + user_aggregator, # User responses llm, # LLM tts, # TTS transport.output(), # Transport bot output - tma_out, # Assistant spoken responses + assistant_aggregator, # Assistant spoken responses ] ) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 036f5fe47..a3cd63cbd 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -6,12 +6,6 @@ from typing import List, Type -from pipecat.processors.aggregators.openai_llm_context import ( - OpenAILLMContextFrame, - OpenAILLMContext, -) - -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.frames.frames import ( Frame, InterimTranscriptionFrame, @@ -22,11 +16,16 @@ LLMMessagesUpdateFrame, LLMSetToolsFrame, StartInterruptionFrame, - TranscriptionFrame, TextFrame, + TranscriptionFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor class LLMResponseAggregator(FrameProcessor): @@ -111,7 +110,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) elif isinstance(frame, self._accumulator_frame): if self._aggregating: - self._aggregation += f" {frame.text}" if self._aggregation else frame.text + self._aggregation += 
frame.text if self._aggregation else frame.text # We have recevied a complete sentence, so if we have seen the # end frame and we were still aggregating, it means we should # send the aggregation. From ed607d5c4b30695c3d7a9a669b3ac707ec62647d Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Sep 2024 22:02:33 -0700 Subject: [PATCH 53/60] typo fix --- examples/foundational/07l-interruptible-together.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py index a7086c941..ca3386718 100644 --- a/examples/foundational/07l-interruptible-together.py +++ b/examples/foundational/07l-interruptible-together.py @@ -71,7 +71,7 @@ async def main(): }, ] - context = OpenAILLMContext(messages, tools) + context = OpenAILLMContext(messages) context_aggregator = llm.create_context_aggregator(context) user_aggregator = context_aggregator.user() assistant_aggregator = context_aggregator.assistant() From fef393dcacf4e1c94d7a819cf3ea35722fbc8a67 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 14:11:17 -0700 Subject: [PATCH 54/60] assistant aggregator switch for space padding or not --- src/pipecat/processors/aggregators/llm_response.py | 10 ++++++++-- src/pipecat/services/anthropic.py | 12 ++++++++---- src/pipecat/services/openai.py | 12 ++++++++---- src/pipecat/services/together.py | 12 ++++++++---- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index a3cd63cbd..4ea38b89f 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -39,6 +39,7 @@ def __init__( accumulator_frame: Type[TextFrame], interim_accumulator_frame: Type[TextFrame] | None = None, handle_interruptions: bool = False, + expect_stripped_words: bool = True, # if True, need to add spaces between words ): super().__init__() @@ -49,6 +50,7 @@ def __init__( self._accumulator_frame = accumulator_frame self._interim_accumulator_frame = interim_accumulator_frame self._handle_interruptions = handle_interruptions + self._expect_stripped_words = expect_stripped_words # Reset our accumulator state. self._reset() @@ -110,7 +112,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self.push_frame(frame, direction) elif isinstance(frame, self._accumulator_frame): if self._aggregating: - self._aggregation += frame.text if self._aggregation else frame.text + if self._expect_stripped_words: + self._aggregation += f" {frame.text}" if self._aggregation else frame.text + else: + self._aggregation += frame.text if self._aggregation else frame.text # We have recevied a complete sentence, so if we have seen the # end frame and we were still aggregating, it means we should # send the aggregation. 
@@ -289,7 +294,7 @@ async def _push_aggregation(self): class LLMAssistantContextAggregator(LLMContextAggregator): - def __init__(self, context: OpenAILLMContext): + def __init__(self, context: OpenAILLMContext, *, expect_stripped_words: bool = True): super().__init__( messages=[], context=context, @@ -298,6 +303,7 @@ def __init__(self, context: OpenAILLMContext): end_frame=LLMFullResponseEndFrame, accumulator_frame=TextFrame, handle_interruptions=True, + expect_stripped_words=expect_stripped_words, ) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 6a535ef15..86e1e3726 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -110,9 +110,13 @@ def enable_prompt_caching_beta(self) -> bool: return self._enable_prompt_caching_beta @staticmethod - def create_context_aggregator(context: OpenAILLMContext) -> AnthropicContextAggregatorPair: + def create_context_aggregator( + context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True + ) -> AnthropicContextAggregatorPair: user = AnthropicUserContextAggregator(context) - assistant = AnthropicAssistantContextAggregator(user) + assistant = AnthropicAssistantContextAggregator( + user, expect_stripped_words=assistant_expect_stripped_words + ) return AnthropicContextAggregatorPair(_user=user, _assistant=assistant) async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool): @@ -541,8 +545,8 @@ async def process_frame(self, frame, direction): class AnthropicAssistantContextAggregator(LLMAssistantContextAggregator): - def __init__(self, user_context_aggregator: AnthropicUserContextAggregator): - super().__init__(context=user_context_aggregator._context) + def __init__(self, user_context_aggregator: AnthropicUserContextAggregator, **kwargs): + super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator self._function_call_in_progress = None self._function_call_result = None diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 99d2d7497..c17916f2d 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -336,9 +336,13 @@ def __init__( super().__init__(model=model, params=params, **kwargs) @staticmethod - def create_context_aggregator(context: OpenAILLMContext) -> OpenAIContextAggregatorPair: + def create_context_aggregator( + context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True + ) -> OpenAIContextAggregatorPair: user = OpenAIUserContextAggregator(context) - assistant = OpenAIAssistantContextAggregator(user) + assistant = OpenAIAssistantContextAggregator( + user, expect_stripped_words=assistant_expect_stripped_words + ) return OpenAIContextAggregatorPair(_user=user, _assistant=assistant) @@ -458,8 +462,8 @@ def __init__(self, context: OpenAILLMContext): class OpenAIAssistantContextAggregator(LLMAssistantContextAggregator): - def __init__(self, user_context_aggregator: OpenAIUserContextAggregator): - super().__init__(context=user_context_aggregator._context) + def __init__(self, user_context_aggregator: OpenAIUserContextAggregator, **kwargs): + super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator self._function_call_in_progress = None self._function_call_result = None diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 935f625ad..3f4d97964 100644 --- a/src/pipecat/services/together.py +++ 
b/src/pipecat/services/together.py @@ -95,9 +95,13 @@ def can_generate_metrics(self) -> bool: return True @staticmethod - def create_context_aggregator(context: OpenAILLMContext) -> TogetherContextAggregatorPair: + def create_context_aggregator( + context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True + ) -> TogetherContextAggregatorPair: user = TogetherUserContextAggregator(context) - assistant = TogetherAssistantContextAggregator(user) + assistant = TogetherAssistantContextAggregator( + user, expect_stripped_words=assistant_expect_stripped_words + ) return TogetherContextAggregatorPair(_user=user, _assistant=assistant) async def set_frequency_penalty(self, frequency_penalty: float): @@ -331,8 +335,8 @@ async def process_frame(self, frame, direction): class TogetherAssistantContextAggregator(LLMAssistantContextAggregator): - def __init__(self, user_context_aggregator: TogetherUserContextAggregator): - super().__init__(context=user_context_aggregator._context) + def __init__(self, user_context_aggregator: TogetherUserContextAggregator, **kwargs): + super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator self._function_call_in_progress = None self._function_call_result = None From 539e0b66fb5abb446358c0b21ece7162fab63237 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 30 Sep 2024 09:26:06 -0700 Subject: [PATCH 55/60] small fix as per aleix --- src/pipecat/processors/aggregators/llm_response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 4ea38b89f..479746471 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -115,7 +115,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if self._expect_stripped_words: self._aggregation += f" {frame.text}" if self._aggregation else frame.text else: - self._aggregation += frame.text if self._aggregation else frame.text + self._aggregation += frame.text # We have recevied a complete sentence, so if we have seen the # end frame and we were still aggregating, it means we should # send the aggregation. From def04ac0ce0fe6f90c66e523066c3b4517dbc8d3 Mon Sep 17 00:00:00 2001 From: JeevanReddy Date: Wed, 7 Aug 2024 13:07:18 +0530 Subject: [PATCH 56/60] openai can give multiple tool calls, current implementation assumes only one function call at a time. Fixed this to handle multiple function calls. --- src/pipecat/services/openai.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index c17916f2d..b17dd7397 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -205,6 +205,10 @@ async def _stream_chat_completions( return chunks async def _process_context(self, context: OpenAILLMContext): + functions_list = [] + arguments_list = [] + tool_id_list = [] + func_idx = 0 function_name = "" arguments = "" tool_call_id = "" @@ -242,6 +246,14 @@ async def _process_context(self, context: OpenAILLMContext): # yield a frame containing the function name and the arguments. 
tool_call = chunk.choices[0].delta.tool_calls[0] + if tool_call.index != func_idx: + functions_list.append(function_name) + arguments_list.append(arguments) + tool_id_list.append(tool_call_id) + function_name = "" + arguments = "" + tool_call_id = "" + func_idx += 1 if tool_call.function and tool_call.function.name: function_name += tool_call.function.name tool_call_id = tool_call.id @@ -257,12 +269,21 @@ async def _process_context(self, context: OpenAILLMContext): # the context, and re-prompt to get a chat answer. If we don't have a registered # handler, raise an exception. if function_name and arguments: - if self.has_function(function_name): - await self._handle_function_call(context, tool_call_id, function_name, arguments) - else: - raise OpenAIUnhandledFunctionException( - f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." - ) + # added to the list as last function name and arguments not added to the list + functions_list.append(function_name) + arguments_list.append(arguments) + tool_id_list.append(tool_call_id) + for function_name, arguments, tool_id in zip( + functions_list, arguments_list, tool_id_list + ): + if self.has_function(function_name): + await self._handle_function_call(context, tool_id, function_name, arguments) + else: + raise OpenAIUnhandledFunctionException( + f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." + ) + # re-prompt to get a human answer after all the functions are called + await self._process_context(context) async def _handle_function_call(self, context, tool_call_id, function_name, arguments): arguments = json.loads(arguments) From a5c73ec829685f302b3fdb8450de2ac75297b72e Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 21:03:59 -0700 Subject: [PATCH 57/60] handle openai multiple function calls --- examples/foundational/14-function-calling.py | 11 ++-- src/pipecat/frames/frames.py | 1 + .../aggregators/openai_llm_context.py | 2 + src/pipecat/services/ai_services.py | 15 +++++- src/pipecat/services/openai.py | 54 +++++++++---------- 5 files changed, 50 insertions(+), 33 deletions(-) diff --git a/examples/foundational/14-function-calling.py b/examples/foundational/14-function-calling.py index b5aba449c..9141029ca 100644 --- a/examples/foundational/14-function-calling.py +++ b/examples/foundational/14-function-calling.py @@ -34,7 +34,12 @@ async def start_fetch_weather(function_name, llm, context): - await llm.push_frame(TextFrame("Let me check on that.")) + # note: we can't push a frame to the LLM here. the bot + # can interrupt itself and/or cause audio overlapping glitches. + # possible question for Aleix and Chad about what the right way + # to trigger speech is, now, with the new queues/async/sync refactors. 
+ # await llm.push_frame(TextFrame("Let me check on that.")) + logger.debug(f"Starting fetch_weather_from_api with function_name: {function_name}") async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): @@ -106,11 +111,11 @@ async def main(): pipeline = Pipeline( [ - fl_in, + # fl_in, transport.input(), context_aggregator.user(), llm, - fl_out, + # fl_out, tts, transport.output(), context_aggregator.assistant(), diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 8059b904b..f7faa8ef0 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -585,6 +585,7 @@ class FunctionCallResultFrame(DataFrame): tool_call_id: str arguments: str result: Any + run_llm: bool = True @dataclass diff --git a/src/pipecat/processors/aggregators/openai_llm_context.py b/src/pipecat/processors/aggregators/openai_llm_context.py index 83ec3e57f..4bf3f042c 100644 --- a/src/pipecat/processors/aggregators/openai_llm_context.py +++ b/src/pipecat/processors/aggregators/openai_llm_context.py @@ -133,6 +133,7 @@ async def call_function( tool_call_id: str, arguments: str, llm: FrameProcessor, + run_llm: bool = True, ) -> None: # Push a SystemFrame downstream. This frame will let our assistant context aggregator # know that we are in the middle of a function call. Some contexts/aggregators may @@ -153,6 +154,7 @@ async def function_call_result_callback(result): tool_call_id=tool_call_id, arguments=arguments, result=result, + run_llm=run_llm, ) ) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 5eadb475b..a46ad3fab 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -110,7 +110,13 @@ def has_function(self, function_name: str): return function_name in self._callbacks.keys() async def call_function( - self, *, context: OpenAILLMContext, tool_call_id: str, function_name: str, arguments: str + self, + *, + context: OpenAILLMContext, + tool_call_id: str, + function_name: str, + arguments: str, + run_llm: bool, ) -> None: f = None if function_name in self._callbacks.keys(): @@ -120,7 +126,12 @@ async def call_function( else: return None await context.call_function( - f, function_name=function_name, tool_call_id=tool_call_id, arguments=arguments, llm=self + f, + function_name=function_name, + tool_call_id=tool_call_id, + arguments=arguments, + llm=self, + run_llm=run_llm, ) # QUESTION FOR CB: maybe this isn't needed anymore? diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index b17dd7397..73dae4644 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -273,26 +273,21 @@ async def _process_context(self, context: OpenAILLMContext): functions_list.append(function_name) arguments_list.append(arguments) tool_id_list.append(tool_call_id) - for function_name, arguments, tool_id in zip( - functions_list, arguments_list, tool_id_list + + total_items = len(functions_list) + for index, (function_name, arguments, tool_id) in enumerate( + zip(functions_list, arguments_list, tool_id_list), start=1 ): if self.has_function(function_name): - await self._handle_function_call(context, tool_id, function_name, arguments) - else: - raise OpenAIUnhandledFunctionException( - f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." 
+ run_llm = index == total_items + arguments = json.loads(arguments) + await self.call_function( + context=context, + function_name=function_name, + arguments=arguments, + tool_call_id=tool_id, + run_llm=run_llm, ) - # re-prompt to get a human answer after all the functions are called - await self._process_context(context) - - async def _handle_function_call(self, context, tool_call_id, function_name, arguments): - arguments = json.loads(arguments) - await self.call_function( - context=context, - tool_call_id=tool_call_id, - function_name=function_name, - arguments=arguments, - ) async def _update_settings(self, frame: LLMUpdateSettingsFrame): if frame.model is not None: @@ -486,31 +481,34 @@ class OpenAIAssistantContextAggregator(LLMAssistantContextAggregator): def __init__(self, user_context_aggregator: OpenAIUserContextAggregator, **kwargs): super().__init__(context=user_context_aggregator._context, **kwargs) self._user_context_aggregator = user_context_aggregator - self._function_call_in_progress = None + self._function_calls_in_progress = {} self._function_call_result = None async def process_frame(self, frame, direction): await super().process_frame(frame, direction) # See note above about not calling push_frame() here. if isinstance(frame, StartInterruptionFrame): - self._function_call_in_progress = None + self._function_calls_in_progress.clear() self._function_call_finished = None + logger.debug("clearing function calls in progress") elif isinstance(frame, FunctionCallInProgressFrame): - self._function_call_in_progress = frame + self._function_calls_in_progress[frame.tool_call_id] = frame + logger.debug( + f"FunctionCallInProgressFrame: {frame.tool_call_id} {self._function_calls_in_progress}" + ) elif isinstance(frame, FunctionCallResultFrame): - if ( - self._function_call_in_progress - and self._function_call_in_progress.tool_call_id == frame.tool_call_id - ): - self._function_call_in_progress = None + logger.debug( + f"FunctionCallResultFrame: {frame.tool_call_id} {self._function_calls_in_progress}" + ) + if frame.tool_call_id in self._function_calls_in_progress: + del self._function_calls_in_progress[frame.tool_call_id] self._function_call_result = frame # TODO-CB: Kwin wants us to refactor this out of here but I REFUSE await self._push_aggregation() else: logger.warning( - "FunctionCallResultFrame tool_call_id does not match FunctionCallInProgressFrame tool_call_id" + "FunctionCallResultFrame tool_call_id does not match any function call in progress" ) - self._function_call_in_progress = None self._function_call_result = None async def _push_aggregation(self): @@ -549,7 +547,7 @@ async def _push_aggregation(self): "tool_call_id": frame.tool_call_id, } ) - run_llm = True + run_llm = frame.run_llm else: self._context.add_message({"role": "assistant", "content": aggregation}) From 6ad3437fd2b3511b7aef91d4f9785b30ff0dc1ec Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 21:10:21 -0700 Subject: [PATCH 58/60] throw error if the llm tries to call a function that's not registered --- src/pipecat/services/openai.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 73dae4644..8a032ea40 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -288,6 +288,10 @@ async def _process_context(self, context: OpenAILLMContext): tool_call_id=tool_id, run_llm=run_llm, ) + else: + raise OpenAIUnhandledFunctionException( + f"The LLM tried to call a function named 
'{function_name}', but there isn't a callback registered for that function." + ) async def _update_settings(self, frame: LLMUpdateSettingsFrame): if frame.model is not None: From 0499fe41e455c700c2de11a1eebb84cc3f71a573 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 29 Sep 2024 21:12:09 -0700 Subject: [PATCH 59/60] get rid of some debug log lines used during development --- src/pipecat/services/openai.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 8a032ea40..49fd04371 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -494,16 +494,9 @@ async def process_frame(self, frame, direction): if isinstance(frame, StartInterruptionFrame): self._function_calls_in_progress.clear() self._function_call_finished = None - logger.debug("clearing function calls in progress") elif isinstance(frame, FunctionCallInProgressFrame): self._function_calls_in_progress[frame.tool_call_id] = frame - logger.debug( - f"FunctionCallInProgressFrame: {frame.tool_call_id} {self._function_calls_in_progress}" - ) elif isinstance(frame, FunctionCallResultFrame): - logger.debug( - f"FunctionCallResultFrame: {frame.tool_call_id} {self._function_calls_in_progress}" - ) if frame.tool_call_id in self._function_calls_in_progress: del self._function_calls_in_progress[frame.tool_call_id] self._function_call_result = frame From 128355add5b80a7694e92472a24e438bbee2c3c5 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 30 Sep 2024 16:19:31 -0700 Subject: [PATCH 60/60] fix for multi-sentence tts say utterances --- src/pipecat/services/ai_services.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 5eadb475b..3e39b5443 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -249,7 +249,10 @@ async def cancel(self, frame: CancelFrame): self._stop_frame_task = None async def say(self, text: str): + aggregate_sentences = self._aggregate_sentences + self._aggregate_sentences = False await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM) + self._aggregate_sentences = aggregate_sentences await self.flush_audio() async def process_frame(self, frame: Frame, direction: FrameDirection):