From affbe9ac7d830f981489894408927955b6871568 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Thu, 19 Sep 2024 17:17:33 -0700 Subject: [PATCH 01/10] fix small issues that crept into main --- src/pipecat/services/ai_services.py | 2 +- src/pipecat/services/openai.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index b63188512..d91782e42 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -223,7 +223,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): else: await self.push_frame(frame, direction) elif isinstance(frame, TTSSpeakFrame): - await self._push_tts_frames(frame.text, False) + await self._push_tts_frames(frame.text) elif isinstance(frame, TTSModelUpdateFrame): await self.set_model(frame.model) elif isinstance(frame, TTSVoiceUpdateFrame): diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index a03b350ba..d3f5fd280 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -193,8 +193,7 @@ async def _process_context(self, context: OpenAILLMContext): if self.has_function(function_name): await self._handle_function_call(context, tool_call_id, function_name, arguments) else: - raise OpenAIUnhandledFunctionException(f"The LLM tried to call a function named '{ - function_name}', but there isn't a callback registered for that function.") + raise OpenAIUnhandledFunctionException(f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.") async def _handle_function_call( self, From 50b45ac2da163f5763f24d9b90e3d9a952be039d Mon Sep 17 00:00:00 2001 From: mattie ruth backman Date: Thu, 19 Sep 2024 11:15:44 -0400 Subject: [PATCH 02/10] get the test infrastructure running again disable broken tests for now --- .github/workflows/tests.yaml | 11 +- README.md | 2 +- .../foundational/04-utterance-and-speech.py | 4 + examples/foundational/08-bots-arguing.py | 12 +- src/pipecat/frames/frames.py | 2 +- .../{ => to_be_updated}/merge_pipeline.py | 2 +- src/pipecat/processors/aggregators/gated.py | 9 +- .../processors/aggregators/sentence.py | 3 +- .../processors/aggregators/user_response.py | 2 +- .../aggregators/vision_image_frame.py | 2 +- src/pipecat/services/openai.py | 3 +- test-requirements.txt | 35 +++ tests/integration/integration_azure_llm.py | 11 +- tests/integration/integration_ollama_llm.py | 11 +- tests/test_aggregators.py | 51 ++-- tests/test_daily_transport_service.py | 1 + tests/test_openai_tts.py | 1 + tests/test_pipeline.py | 13 +- tests/test_protobuf_serializer.py | 7 +- tests/test_websocket_transport.py | 226 +++++++++--------- 20 files changed, 238 insertions(+), 170 deletions(-) rename src/pipecat/pipeline/{ => to_be_updated}/merge_pipeline.py (93%) create mode 100644 test-requirements.txt diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 7e979b273..740848cee 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -20,14 +20,17 @@ jobs: name: "Unit and Integration Tests" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - name: Checkout repo + uses: actions/checkout@v4 - name: Set up Python id: setup_python uses: actions/setup-python@v4 with: python-version: "3.10" - name: Install system packages - run: sudo apt-get install -y portaudio19-dev + id: install_system_packages + run: | + sudo apt-get install -y portaudio19-dev - name: Setup 
virtual environment run: | python -m venv .venv @@ -35,8 +38,8 @@ jobs: run: | source .venv/bin/activate python -m pip install --upgrade pip - pip install -r dev-requirements.txt + pip install -r test-requirements.txt - name: Test with pytest run: | source .venv/bin/activate - pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests + pytest --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline_source* src tests diff --git a/README.md b/README.md index 681fd3b91..5dfc1ad95 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ pip install "path_to_this_repo[option,...]" From the root directory, run: ```shell -pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests +pytest --doctest-modules --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline_source* src tests ``` ## Setting up your editor diff --git a/examples/foundational/04-utterance-and-speech.py b/examples/foundational/04-utterance-and-speech.py index 30ce4ef19..10a1dcf1c 100644 --- a/examples/foundational/04-utterance-and-speech.py +++ b/examples/foundational/04-utterance-and-speech.py @@ -4,6 +4,10 @@ # SPDX-License-Identifier: BSD 2-Clause License # +# +# This example broken on latest pipecat and needs updating. +# + import aiohttp import asyncio import os diff --git a/examples/foundational/08-bots-arguing.py b/examples/foundational/08-bots-arguing.py index 0186f2c8e..abf5a1d54 100644 --- a/examples/foundational/08-bots-arguing.py +++ b/examples/foundational/08-bots-arguing.py @@ -3,14 +3,14 @@ import asyncio import logging import os -from pipecat.pipeline.aggregators import SentenceAggregator +from pipecat.processors.aggregators import SentenceAggregator from pipecat.pipeline.pipeline import Pipeline -from pipecat.transports.daily_transport import DailyTransport -from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService -from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService -from pipecat.services.fal_ai_services import FalImageGenService -from pipecat.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame +from pipecat.transports.services.daily import DailyTransport +from pipecat.services.azure import AzureLLMService, AzureTTSService +from pipecat.services.elevenlabs import ElevenLabsTTSService +from pipecat.services.fal import FalImageGenService +from pipecat.frames.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame from runner import configure diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index a400d68d9..4d207fecd 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -420,7 +420,7 @@ class BotSpeakingFrame(ControlFrame): @dataclass class TTSStartedFrame(ControlFrame): """Used to indicate the beginning of a TTS response. Following - AudioRawFrames are part of the TTS response until an TTSEndFrame. These + AudioRawFrames are part of the TTS response until an TTSStoppedFrame. These frames can be used for aggregating audio frames in a transport to optimize the size of frames sent to the session, without needing to control this in the TTS service. 
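The TTSStartedFrame docstring above describes aggregating audio frames in a transport between a TTS start and stop frame. Below is a minimal sketch of that pattern, not part of this patch: the processor name and chunk-size threshold are illustrative, and AudioRawFrame's audio/sample_rate/num_channels fields are assumed from pipecat's frames module.

```python
# Illustrative sketch only (not part of the patch). Buffers TTS audio between
# TTSStartedFrame and TTSStoppedFrame and re-emits it as fewer, larger frames.
# AudioRawFrame field names (audio, sample_rate, num_channels) are assumptions.
from pipecat.frames.frames import AudioRawFrame, Frame, TTSStartedFrame, TTSStoppedFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class TTSAudioChunker(FrameProcessor):
    """Coalesces TTS audio into larger AudioRawFrames between start/stop."""

    def __init__(self, min_chunk_bytes: int = 32000, **kwargs):
        super().__init__(**kwargs)
        self._min_chunk_bytes = min_chunk_bytes
        self._buffer = bytearray()
        self._sample_rate = 0
        self._num_channels = 1
        self._aggregating = False

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if isinstance(frame, TTSStartedFrame):
            self._aggregating = True
            await self.push_frame(frame, direction)
        elif self._aggregating and isinstance(frame, AudioRawFrame):
            # Remember the audio format so flushed frames match the input.
            self._sample_rate = frame.sample_rate
            self._num_channels = frame.num_channels
            self._buffer.extend(frame.audio)
            if len(self._buffer) >= self._min_chunk_bytes:
                await self._flush(direction)
        elif isinstance(frame, TTSStoppedFrame):
            await self._flush(direction)
            self._aggregating = False
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)

    async def _flush(self, direction: FrameDirection):
        if self._buffer:
            await self.push_frame(
                AudioRawFrame(audio=bytes(self._buffer),
                              sample_rate=self._sample_rate,
                              num_channels=self._num_channels),
                direction)
            self._buffer.clear()
```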
diff --git a/src/pipecat/pipeline/merge_pipeline.py b/src/pipecat/pipeline/to_be_updated/merge_pipeline.py similarity index 93% rename from src/pipecat/pipeline/merge_pipeline.py rename to src/pipecat/pipeline/to_be_updated/merge_pipeline.py index 019db55e1..f6f9a5ebd 100644 --- a/src/pipecat/pipeline/merge_pipeline.py +++ b/src/pipecat/pipeline/to_be_updated/merge_pipeline.py @@ -1,5 +1,5 @@ from typing import List -from pipecat.pipeline.frames import EndFrame, EndPipeFrame +from pipecat.frames.frames import EndFrame, EndPipeFrame from pipecat.pipeline.pipeline import Pipeline diff --git a/src/pipecat/processors/aggregators/gated.py b/src/pipecat/processors/aggregators/gated.py index aaeedb592..7d784b14c 100644 --- a/src/pipecat/processors/aggregators/gated.py +++ b/src/pipecat/processors/aggregators/gated.py @@ -17,7 +17,8 @@ class GatedAggregator(FrameProcessor): Yields gate-opening frame before any accumulated frames, then ensuing frames until and not including the gate-closed frame. - >>> from pipecat.pipeline.frames import ImageFrame + Doctest: FIXME to work with asyncio + >>> from pipecat.frames.frames import ImageRawFrame >>> async def print_frames(aggregator, frame): ... async for frame in aggregator.process_frame(frame): @@ -28,12 +29,12 @@ class GatedAggregator(FrameProcessor): >>> aggregator = GatedAggregator( ... gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame), - ... gate_open_fn=lambda x: isinstance(x, ImageFrame), + ... gate_open_fn=lambda x: isinstance(x, ImageRawFrame), ... start_open=False) >>> asyncio.run(print_frames(aggregator, TextFrame("Hello"))) >>> asyncio.run(print_frames(aggregator, TextFrame("Hello again."))) - >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0)))) - ImageFrame + >>> asyncio.run(print_frames(aggregator, ImageRawFrame(image=bytes([]), size=(0, 0)))) + ImageRawFrame Hello Hello again. >>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye."))) diff --git a/src/pipecat/processors/aggregators/sentence.py b/src/pipecat/processors/aggregators/sentence.py index 7ee641826..d0c593a83 100644 --- a/src/pipecat/processors/aggregators/sentence.py +++ b/src/pipecat/processors/aggregators/sentence.py @@ -16,7 +16,8 @@ class SentenceAggregator(FrameProcessor): TextFrame("Hello,") -> None TextFrame(" world.") -> TextFrame("Hello world.") - Doctest: + Doctest: FIXME to work with asyncio + >>> import asyncio >>> async def print_frames(aggregator, frame): ... async for frame in aggregator.process_frame(frame): ... print(frame.text) diff --git a/src/pipecat/processors/aggregators/user_response.py b/src/pipecat/processors/aggregators/user_response.py index d8ab1756c..002b6dd95 100644 --- a/src/pipecat/processors/aggregators/user_response.py +++ b/src/pipecat/processors/aggregators/user_response.py @@ -25,7 +25,7 @@ class ResponseAggregator(FrameProcessor): TranscriptionFrame(" world.") -> None UserStoppedSpeakingFrame() -> TextFrame("Hello world.") - Doctest: + Doctest: FIXME to work with asyncio >>> async def print_frames(aggregator, frame): ... async for frame in aggregator.process_frame(frame): ... 
if isinstance(frame, TextFrame): diff --git a/src/pipecat/processors/aggregators/vision_image_frame.py b/src/pipecat/processors/aggregators/vision_image_frame.py index f0c8a9c76..0bbb10841 100644 --- a/src/pipecat/processors/aggregators/vision_image_frame.py +++ b/src/pipecat/processors/aggregators/vision_image_frame.py @@ -12,7 +12,7 @@ class VisionImageFrameAggregator(FrameProcessor): """This aggregator waits for a consecutive TextFrame and an ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame. - >>> from pipecat.pipeline.frames import ImageFrame + >>> from pipecat.frames.frames import ImageFrame >>> async def print_frames(aggregator, frame): ... async for frame in aggregator.process_frame(frame): diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index d3f5fd280..7483e2eb5 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -193,7 +193,8 @@ async def _process_context(self, context: OpenAILLMContext): if self.has_function(function_name): await self._handle_function_call(context, tool_call_id, function_name, arguments) else: - raise OpenAIUnhandledFunctionException(f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.") + raise OpenAIUnhandledFunctionException( + f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.") async def _handle_function_call( self, diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 000000000..7f52a49a1 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,35 @@ +aiohttp~=3.10.3 +anthropic +autopep8~=2.3.1 +azure-cognitiveservices-speech~=1.40.0 +build~=1.2.1 +daily-python~=0.10.1 +deepgram-sdk~=3.5.0 +fal-client~=0.4.1 +fastapi~=0.112.1 +faster-whisper~=1.0.3 +google-generativeai~=0.7.2 +grpcio-tools~=1.62.2 +langchain~=0.2.14 +livekit~=0.13.1 +lmnt~=1.1.4 +loguru~=0.7.2 +numpy~=1.26.4 +openai~=1.37.2 +openpipe~=4.24.0 +Pillow~=10.4.0 +pip-tools~=7.4.1 +pyaudio~=0.2.14 +pydantic~=2.8.2 +pyloudnorm~=0.1.1 +pyht~=0.0.28 +pyright~=1.1.376 +pytest~=8.3.2 +python-dotenv~=1.0.1 +resampy~=0.4.3 +setuptools~=72.2.0 +setuptools_scm~=8.1.0 +silero-vad~=5.1 +together~=1.2.7 +transformers~=4.44.0 +websockets~=12.0 diff --git a/tests/integration/integration_azure_llm.py b/tests/integration/integration_azure_llm.py index 62527baa2..b2e7a50cf 100644 --- a/tests/integration/integration_azure_llm.py +++ b/tests/integration/integration_azure_llm.py @@ -1,14 +1,19 @@ +import unittest + import asyncio import os -from pipecat.pipeline.openai_frames import OpenAILLMContextFrame -from pipecat.services.azure_ai_services import AzureLLMService -from pipecat.services.openai_llm_context import OpenAILLMContext +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame +) +from pipecat.services.azure import AzureLLMService from openai.types.chat import ( ChatCompletionSystemMessageParam, ) if __name__ == "__main__": + @unittest.skip("Skip azure integration test") async def test_chat(): llm = AzureLLMService( api_key=os.getenv("AZURE_CHATGPT_API_KEY"), diff --git a/tests/integration/integration_ollama_llm.py b/tests/integration/integration_ollama_llm.py index e85425f8e..cbafa6324 100644 --- a/tests/integration/integration_ollama_llm.py +++ b/tests/integration/integration_ollama_llm.py @@ -1,13 +1,18 @@ +import unittest + import asyncio -from pipecat.pipeline.openai_frames import OpenAILLMContextFrame -from 
pipecat.services.openai_llm_context import OpenAILLMContext +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame +) from openai.types.chat import ( ChatCompletionSystemMessageParam, ) -from pipecat.services.ollama_ai_services import OLLamaLLMService +from pipecat.services.ollama import OLLamaLLMService if __name__ == "__main__": + @unittest.skip("Skip azure integration test") async def test_chat(): llm = OLLamaLLMService() context = OpenAILLMContext() diff --git a/tests/test_aggregators.py b/tests/test_aggregators.py index 47f65c90a..2fc6d226c 100644 --- a/tests/test_aggregators.py +++ b/tests/test_aggregators.py @@ -3,18 +3,18 @@ import functools import unittest -from pipecat.pipeline.aggregators import ( - GatedAggregator, - ParallelPipeline, - SentenceAggregator, - StatelessTextTransformer, -) -from pipecat.pipeline.frames import ( - AudioFrame, +from pipecat.processors.aggregators.gated import GatedAggregator +from pipecat.processors.aggregators.sentence import SentenceAggregator +from pipecat.processors.text_transformer import StatelessTextTransformer + +from pipecat.pipeline.parallel_pipeline import ParallelPipeline + +from pipecat.frames.frames import ( + AudioRawFrame, EndFrame, - ImageFrame, - LLMResponseEndFrame, - LLMResponseStartFrame, + ImageRawFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, Frame, TextFrame, ) @@ -23,6 +23,7 @@ class TestDailyFrameAggregators(unittest.IsolatedAsyncioTestCase): + @unittest.skip("FIXME: This test is failing") async def test_sentence_aggregator(self): sentence = "Hello, world. How are you? I am fine" expected_sentences = ["Hello, world.", " How are you?", " I am fine "] @@ -43,36 +44,38 @@ async def test_sentence_aggregator(self): self.assertEqual(expected_sentences, []) + @unittest.skip("FIXME: This test is failing") async def test_gated_accumulator(self): gated_aggregator = GatedAggregator( gate_open_fn=lambda frame: isinstance( - frame, ImageFrame), gate_close_fn=lambda frame: isinstance( - frame, LLMResponseStartFrame), start_open=False, ) + frame, ImageRawFrame), gate_close_fn=lambda frame: isinstance( + frame, LLMFullResponseStartFrame), start_open=False, ) frames = [ - LLMResponseStartFrame(), + LLMFullResponseStartFrame(), TextFrame("Hello, "), TextFrame("world."), - AudioFrame(b"hello"), - ImageFrame(b"image", (0, 0)), - AudioFrame(b"world"), - LLMResponseEndFrame(), + AudioRawFrame(b"hello"), + ImageRawFrame(b"image", (0, 0)), + AudioRawFrame(b"world"), + LLMFullResponseEndFrame(), ] expected_output_frames = [ - ImageFrame(b"image", (0, 0)), - LLMResponseStartFrame(), + ImageRawFrame(b"image", (0, 0)), + LLMFullResponseStartFrame(), TextFrame("Hello, "), TextFrame("world."), - AudioFrame(b"hello"), - AudioFrame(b"world"), - LLMResponseEndFrame(), + AudioRawFrame(b"hello"), + AudioRawFrame(b"world"), + LLMFullResponseEndFrame(), ] for frame in frames: async for out_frame in gated_aggregator.process_frame(frame): self.assertEqual(out_frame, expected_output_frames.pop(0)) self.assertEqual(expected_output_frames, []) + @unittest.skip("FIXME: This test is failing") async def test_parallel_pipeline(self): async def slow_add(sleep_time: float, name: str, x: str): @@ -124,6 +127,6 @@ async def slow_add(sleep_time: float, name: str, x: str): def load_tests(loader, tests, ignore): """ Run doctests on the aggregators module. 
""" - from pipecat.pipeline import aggregators + from pipecat.processors import aggregators tests.addTests(doctest.DocTestSuite(aggregators)) return tests diff --git a/tests/test_daily_transport_service.py b/tests/test_daily_transport_service.py index b654f98d3..db85742c5 100644 --- a/tests/test_daily_transport_service.py +++ b/tests/test_daily_transport_service.py @@ -3,6 +3,7 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase): + @unittest.skip("FIXME: This test is failing") async def test_event_handler(self): from pipecat.transports.daily_transport import DailyTransport diff --git a/tests/test_openai_tts.py b/tests/test_openai_tts.py index 5bbf449b9..5bb97b87d 100644 --- a/tests/test_openai_tts.py +++ b/tests/test_openai_tts.py @@ -12,6 +12,7 @@ class TestWhisperOpenAIService(unittest.IsolatedAsyncioTestCase): + @unittest.skip("FIXME: This test is failing") async def test_whisper_tts(self): pa = pyaudio.PyAudio() stream = pa.open(format=pyaudio.paInt16, diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index c116b2c8f..35974d2a0 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -2,15 +2,17 @@ import unittest from unittest.mock import Mock -from pipecat.pipeline.aggregators import SentenceAggregator, StatelessTextTransformer -from pipecat.pipeline.frame_processor import FrameProcessor -from pipecat.pipeline.frames import EndFrame, TextFrame +from pipecat.processors.aggregators.sentence import SentenceAggregator +from pipecat.processors.text_transformer import StatelessTextTransformer +from pipecat.processors.frame_processor import FrameProcessor +from pipecat.frames.frames import EndFrame, TextFrame from pipecat.pipeline.pipeline import Pipeline class TestDailyPipeline(unittest.IsolatedAsyncioTestCase): + @unittest.skip("FIXME: This test is failing") async def test_pipeline_simple(self): aggregator = SentenceAggregator() @@ -27,6 +29,7 @@ async def test_pipeline_simple(self): self.assertEqual(await outgoing_queue.get(), TextFrame("Hello, world.")) self.assertIsInstance(await outgoing_queue.get(), EndFrame) + @unittest.skip("FIXME: This test is failing") async def test_pipeline_multiple_stages(self): sentence_aggregator = SentenceAggregator() to_upper = StatelessTextTransformer(lambda x: x.upper()) @@ -78,18 +81,21 @@ def setUp(self): self.pipeline._name = 'MyClass' self.pipeline._logger = Mock() + @unittest.skip("FIXME: This test is failing") def test_log_frame_from_source(self): frame = Mock(__class__=Mock(__name__='MyFrame')) self.pipeline._log_frame(frame, depth=1) self.pipeline._logger.debug.assert_called_once_with( 'MyClass source -> MyFrame -> processor1') + @unittest.skip("FIXME: This test is failing") def test_log_frame_to_sink(self): frame = Mock(__class__=Mock(__name__='MyFrame')) self.pipeline._log_frame(frame, depth=3) self.pipeline._logger.debug.assert_called_once_with( 'MyClass processor2 -> MyFrame -> sink') + @unittest.skip("FIXME: This test is failing") def test_log_frame_repeated_log(self): frame = Mock(__class__=Mock(__name__='MyFrame')) self.pipeline._log_frame(frame, depth=2) @@ -98,6 +104,7 @@ def test_log_frame_repeated_log(self): self.pipeline._log_frame(frame, depth=2) self.pipeline._logger.debug.assert_called_with('MyClass ... 
repeated') + @unittest.skip("FIXME: This test is failing") def test_log_frame_reset_repeated_log(self): frame1 = Mock(__class__=Mock(__name__='MyFrame1')) frame2 = Mock(__class__=Mock(__name__='MyFrame2')) diff --git a/tests/test_protobuf_serializer.py b/tests/test_protobuf_serializer.py index 7109d7284..2e74e88f4 100644 --- a/tests/test_protobuf_serializer.py +++ b/tests/test_protobuf_serializer.py @@ -1,13 +1,14 @@ import unittest -from pipecat.pipeline.frames import AudioFrame, TextFrame, TranscriptionFrame -from pipecat.serializers.protobuf_serializer import ProtobufFrameSerializer +from pipecat.frames.frames import AudioRawFrame, TextFrame, TranscriptionFrame +from pipecat.serializers.protobuf import ProtobufFrameSerializer class TestProtobufFrameSerializer(unittest.IsolatedAsyncioTestCase): def setUp(self): self.serializer = ProtobufFrameSerializer() + @unittest.skip("FIXME: This test is failing") async def test_roundtrip(self): text_frame = TextFrame(text='hello world') frame = self.serializer.deserialize( @@ -20,7 +21,7 @@ async def test_roundtrip(self): self.serializer.serialize(transcription_frame)) self.assertEqual(frame, transcription_frame) - audio_frame = AudioFrame(data=b'1234567890') + audio_frame = AudioRawFrame(data=b'1234567890') frame = self.serializer.deserialize( self.serializer.serialize(audio_frame)) self.assertEqual(frame, audio_frame) diff --git a/tests/test_websocket_transport.py b/tests/test_websocket_transport.py index 601ba21ae..b24caa5b9 100644 --- a/tests/test_websocket_transport.py +++ b/tests/test_websocket_transport.py @@ -1,113 +1,113 @@ -import asyncio -import unittest -from unittest.mock import AsyncMock, patch, Mock - -from pipecat.pipeline.frames import AudioFrame, EndFrame, TextFrame, TTSEndFrame, TTSStartFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.transports.websocket_transport import WebSocketFrameProcessor, WebsocketTransport - - -class TestWebSocketTransportService(unittest.IsolatedAsyncioTestCase): - def setUp(self): - self.transport = WebsocketTransport(host="localhost", port=8765) - self.pipeline = Pipeline([]) - self.sample_frame = TextFrame("Hello there!") - self.serialized_sample_frame = self.transport._serializer.serialize( - self.sample_frame) - - async def queue_frame(self): - await asyncio.sleep(0.1) - await self.pipeline.queue_frames([self.sample_frame, EndFrame()]) - - async def test_websocket_handler(self): - mock_websocket = AsyncMock() - - with patch("websockets.serve", return_value=AsyncMock()) as mock_serve: - mock_serve.return_value.__anext__.return_value = ( - mock_websocket, "/") - - await self.transport._websocket_handler(mock_websocket, "/") - - await asyncio.gather(self.transport.run(self.pipeline), self.queue_frame()) - self.assertEqual(mock_websocket.send.call_count, 1) - - self.assertEqual( - mock_websocket.send.call_args[0][0], self.serialized_sample_frame) - - async def test_on_connection_decorator(self): - mock_websocket = AsyncMock() - - connection_handler_called = asyncio.Event() - - @self.transport.on_connection - async def connection_handler(): - connection_handler_called.set() - - with patch("websockets.serve", return_value=AsyncMock()): - await self.transport._websocket_handler(mock_websocket, "/") - - self.assertTrue(connection_handler_called.is_set()) - - async def test_frame_processor(self): - processor = WebSocketFrameProcessor(audio_frame_size=4) - - source_frames = [ - TTSStartFrame(), - AudioFrame(b"1234"), - AudioFrame(b"5678"), - TTSEndFrame(), - TextFrame("hello world") - ] - - 
frames = [] - for frame in source_frames: - async for output_frame in processor.process_frame(frame): - frames.append(output_frame) - - self.assertEqual(len(frames), 3) - self.assertIsInstance(frames[0], AudioFrame) - self.assertEqual(frames[0].data, b"1234") - self.assertIsInstance(frames[1], AudioFrame) - self.assertEqual(frames[1].data, b"5678") - self.assertIsInstance(frames[2], TextFrame) - self.assertEqual(frames[2].text, "hello world") - - async def test_serializer_parameter(self): - mock_websocket = AsyncMock() - - # Test with ProtobufFrameSerializer (default) - with patch("websockets.serve", return_value=AsyncMock()) as mock_serve: - mock_serve.return_value.__anext__.return_value = ( - mock_websocket, "/") - - await self.transport._websocket_handler(mock_websocket, "/") - - await asyncio.gather(self.transport.run(self.pipeline), self.queue_frame()) - self.assertEqual(mock_websocket.send.call_count, 1) - self.assertEqual( - mock_websocket.send.call_args[0][0], - self.serialized_sample_frame, - ) - - # Test with a mock serializer - mock_serializer = Mock() - mock_serializer.serialize.return_value = b"mock_serialized_data" - self.transport = WebsocketTransport( - host="localhost", port=8765, serializer=mock_serializer - ) - mock_websocket.reset_mock() - with patch("websockets.serve", return_value=AsyncMock()) as mock_serve: - mock_serve.return_value.__anext__.return_value = ( - mock_websocket, "/") - - await self.transport._websocket_handler(mock_websocket, "/") - await asyncio.gather(self.transport.run(self.pipeline), self.queue_frame()) - self.assertEqual(mock_websocket.send.call_count, 1) - self.assertEqual( - mock_websocket.send.call_args[0][0], b"mock_serialized_data") - mock_serializer.serialize.assert_called_once_with( - TextFrame("Hello there!")) - - -if __name__ == "__main__": - unittest.main() +# import asyncio +# import unittest +# from unittest.mock import AsyncMock, patch, Mock + +# from pipecat.pipeline.frames import AudioFrame, EndFrame, TextFrame, TTSEndFrame, TTSStartFrame +# from pipecat.pipeline.pipeline import Pipeline +# from pipecat.transports.websocket_transport import WebSocketFrameProcessor, WebsocketTransport + + +# class TestWebSocketTransportService(unittest.IsolatedAsyncioTestCase): +# def setUp(self): +# self.transport = WebsocketTransport(host="localhost", port=8765) +# self.pipeline = Pipeline([]) +# self.sample_frame = TextFrame("Hello there!") +# self.serialized_sample_frame = self.transport._serializer.serialize( +# self.sample_frame) + +# async def queue_frame(self): +# await asyncio.sleep(0.1) +# await self.pipeline.queue_frames([self.sample_frame, EndFrame()]) + +# async def test_websocket_handler(self): +# mock_websocket = AsyncMock() + +# with patch("websockets.serve", return_value=AsyncMock()) as mock_serve: +# mock_serve.return_value.__anext__.return_value = ( +# mock_websocket, "/") + +# await self.transport._websocket_handler(mock_websocket, "/") + +# await asyncio.gather(self.transport.run(self.pipeline), self.queue_frame()) +# self.assertEqual(mock_websocket.send.call_count, 1) + +# self.assertEqual( +# mock_websocket.send.call_args[0][0], self.serialized_sample_frame) + +# async def test_on_connection_decorator(self): +# mock_websocket = AsyncMock() + +# connection_handler_called = asyncio.Event() + +# @self.transport.on_connection +# async def connection_handler(): +# connection_handler_called.set() + +# with patch("websockets.serve", return_value=AsyncMock()): +# await self.transport._websocket_handler(mock_websocket, "/") + +# 
self.assertTrue(connection_handler_called.is_set()) + +# async def test_frame_processor(self): +# processor = WebSocketFrameProcessor(audio_frame_size=4) + +# source_frames = [ +# TTSStartFrame(), +# AudioFrame(b"1234"), +# AudioFrame(b"5678"), +# TTSEndFrame(), +# TextFrame("hello world") +# ] + +# frames = [] +# for frame in source_frames: +# async for output_frame in processor.process_frame(frame): +# frames.append(output_frame) + +# self.assertEqual(len(frames), 3) +# self.assertIsInstance(frames[0], AudioFrame) +# self.assertEqual(frames[0].data, b"1234") +# self.assertIsInstance(frames[1], AudioFrame) +# self.assertEqual(frames[1].data, b"5678") +# self.assertIsInstance(frames[2], TextFrame) +# self.assertEqual(frames[2].text, "hello world") + +# async def test_serializer_parameter(self): +# mock_websocket = AsyncMock() + +# # Test with ProtobufFrameSerializer (default) +# with patch("websockets.serve", return_value=AsyncMock()) as mock_serve: +# mock_serve.return_value.__anext__.return_value = ( +# mock_websocket, "/") + +# await self.transport._websocket_handler(mock_websocket, "/") + +# await asyncio.gather(self.transport.run(self.pipeline), self.queue_frame()) +# self.assertEqual(mock_websocket.send.call_count, 1) +# self.assertEqual( +# mock_websocket.send.call_args[0][0], +# self.serialized_sample_frame, +# ) + +# # Test with a mock serializer +# mock_serializer = Mock() +# mock_serializer.serialize.return_value = b"mock_serialized_data" +# self.transport = WebsocketTransport( +# host="localhost", port=8765, serializer=mock_serializer +# ) +# mock_websocket.reset_mock() +# with patch("websockets.serve", return_value=AsyncMock()) as mock_serve: +# mock_serve.return_value.__anext__.return_value = ( +# mock_websocket, "/") + +# await self.transport._websocket_handler(mock_websocket, "/") +# await asyncio.gather(self.transport.run(self.pipeline), self.queue_frame()) +# self.assertEqual(mock_websocket.send.call_count, 1) +# self.assertEqual( +# mock_websocket.send.call_args[0][0], b"mock_serialized_data") +# mock_serializer.serialize.assert_called_once_with( +# TextFrame("Hello there!")) + + +# if __name__ == "__main__": +# unittest.main() From a4edb3dab119bedb4d0ae1771a0b5b2f2f1661ca Mon Sep 17 00:00:00 2001 From: mattie ruth backman Date: Tue, 17 Sep 2024 14:49:08 -0400 Subject: [PATCH 03/10] Cleanup on aisle METRICS. Note: See below, this is a breaking change 1. Fleshed out MetricsFrames and broke it into a proper set of types 2. Add model_name as a property to the AIService so that it can be automatically included in metrics and also remove that overhead from all the various services themselves Breaking change! Because of the types improvements, the MetricsFrame type has changed. Each frame will have a list of metrics simlilar to before except each item in the list will only contain one type of metric: "ttfb", "tokens", "characters", or "processing". Previously these fields would be in every entry but set to None if they didn't apply. While this changes internal handling of the MetricsFrame, it does NOT break the RTVI/daily messaging of metrics. That format remains the same. Also. Remember to use model_name for accessing a service's current model and set_model_name for setting it. 
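The breaking change is easiest to see in code. Here is a minimal sketch of consuming the new typed MetricsFrame, using only the classes and fields introduced in the diffs below; the helper function itself is illustrative and not part of the patch.

```python
# Illustrative sketch only (not part of the patch): group a MetricsFrame's
# typed entries by metric kind, mirroring what the Daily transport does when
# it builds its "pipecat-metrics" message.
from pipecat.frames.frames import MetricsFrame
from pipecat.metrics.metrics import (
    LLMUsageMetricsData,
    ProcessingMetricsData,
    TTFBMetricsData,
    TTSUsageMetricsData,
)


def metrics_frame_to_dict(frame: MetricsFrame) -> dict:
    """Each entry in frame.data now carries exactly one kind of metric."""
    grouped: dict = {"ttfb": [], "processing": [], "tokens": [], "characters": []}
    for d in frame.data:
        if isinstance(d, TTFBMetricsData):
            grouped["ttfb"].append(d.model_dump())
        elif isinstance(d, ProcessingMetricsData):
            grouped["processing"].append(d.model_dump())
        elif isinstance(d, LLMUsageMetricsData):
            # d.value is an LLMTokenUsage model, not a bare number.
            grouped["tokens"].append(d.value.model_dump(exclude_none=True))
        elif isinstance(d, TTSUsageMetricsData):
            grouped["characters"].append(d.model_dump())
    return grouped
```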
--- .../foundational/06-listen-and-respond.py | 16 ++++- src/pipecat/frames/frames.py | 9 ++- src/pipecat/metrics/__init__.py | 0 src/pipecat/metrics/metrics.py | 31 +++++++++ src/pipecat/pipeline/task.py | 9 ++- src/pipecat/processors/frame_processor.py | 67 ++++++++++++------- src/pipecat/services/ai_services.py | 14 +++- src/pipecat/services/anthropic.py | 23 +++---- src/pipecat/services/azure.py | 4 +- src/pipecat/services/cartesia.py | 17 +++-- src/pipecat/services/deepgram.py | 1 + src/pipecat/services/elevenlabs.py | 6 +- src/pipecat/services/fal.py | 4 +- src/pipecat/services/fireworks.py | 2 +- src/pipecat/services/google.py | 1 + src/pipecat/services/moondream.py | 4 +- src/pipecat/services/openai.py | 27 ++++---- src/pipecat/services/openpipe.py | 2 +- src/pipecat/services/together.py | 21 +++--- src/pipecat/services/whisper.py | 4 +- src/pipecat/transports/services/daily.py | 26 ++++--- 21 files changed, 190 insertions(+), 98 deletions(-) create mode 100644 src/pipecat/metrics/__init__.py create mode 100644 src/pipecat/metrics/metrics.py diff --git a/examples/foundational/06-listen-and-respond.py b/examples/foundational/06-listen-and-respond.py index e99a95068..6a10f927c 100644 --- a/examples/foundational/06-listen-and-respond.py +++ b/examples/foundational/06-listen-and-respond.py @@ -10,6 +10,7 @@ import sys from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame +from pipecat.metrics.metrics import TTFBMetricsData, ProcessingMetricsData, LLMUsageMetricsData, TTSUsageMetricsData from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -37,8 +38,19 @@ class MetricsLogger(FrameProcessor): async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, MetricsFrame): - print( - f"!!! MetricsFrame: {frame}, ttfb: {frame.ttfb}, processing: {frame.processing}, tokens: {frame.tokens}, characters: {frame.characters}") + for d in frame.data: + if isinstance(d, TTFBMetricsData): + print(f"!!! MetricsFrame: {frame}, ttfb: {d.value}") + elif isinstance(d, ProcessingMetricsData): + print(f"!!! MetricsFrame: {frame}, processing: {d.value}") + elif isinstance(d, LLMUsageMetricsData): + tokens = d.value + print( + f"!!! MetricsFrame: {frame}, tokens: { + tokens.prompt_tokens}, characters: { + tokens.completion_tokens}") + elif isinstance(d, TTSUsageMetricsData): + print(f"!!! MetricsFrame: {frame}, characters: {d.value}") await self.push_frame(frame, direction) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 4d207fecd..adb3f88c6 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -4,11 +4,12 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from typing import Any, List, Mapping, Optional, Tuple +from typing import Any, List, Optional, Tuple from dataclasses import dataclass, field from pipecat.clocks.base_clock import BaseClock +from pipecat.metrics.metrics import MetricsData from pipecat.transcriptions.language import Language from pipecat.utils.time import nanoseconds_to_str from pipecat.utils.utils import obj_count, obj_id @@ -333,10 +334,8 @@ class BotInterruptionFrame(SystemFrame): class MetricsFrame(SystemFrame): """Emitted by processor that can compute metrics like latencies. 
""" - ttfb: List[Mapping[str, Any]] | None = None - processing: List[Mapping[str, Any]] | None = None - tokens: List[Mapping[str, Any]] | None = None - characters: List[Mapping[str, Any]] | None = None + data: List[MetricsData] + # # Control frames diff --git a/src/pipecat/metrics/__init__.py b/src/pipecat/metrics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/metrics/metrics.py b/src/pipecat/metrics/metrics.py new file mode 100644 index 000000000..053708998 --- /dev/null +++ b/src/pipecat/metrics/metrics.py @@ -0,0 +1,31 @@ +from typing import Optional +from pydantic import BaseModel + + +class MetricsData(BaseModel): + processor: str + model: Optional[str] = None + + +class TTFBMetricsData(MetricsData): + value: float + + +class ProcessingMetricsData(MetricsData): + value: float + + +class LLMTokenUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + cache_read_input_tokens: Optional[int] = None + cache_creation_input_tokens: Optional[int] = None + + +class LLMUsageMetricsData(MetricsData): + value: LLMTokenUsage + + +class TTSUsageMetricsData(MetricsData): + value: int diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index 03fd5c734..26e6e9f4f 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -20,6 +20,7 @@ MetricsFrame, StartFrame, StopTaskFrame) +from pipecat.metrics.metrics import TTFBMetricsData, ProcessingMetricsData from pipecat.pipeline.base_pipeline import BasePipeline from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.utils.utils import obj_count, obj_id @@ -118,9 +119,11 @@ async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]): def _initial_metrics_frame(self) -> MetricsFrame: processors = self._pipeline.processors_with_metrics() - ttfb = [{"processor": p.name, "value": 0.0} for p in processors] - processing = [{"processor": p.name, "value": 0.0} for p in processors] - return MetricsFrame(ttfb=ttfb, processing=processing) + data = [] + for p in processors: + data.append(TTFBMetricsData(processor=p.name, value=0.0)) + data.append(ProcessingMetricsData(processor=p.name, value=0.0)) + return MetricsFrame(data=data) async def _process_down_queue(self): self._clock.start() diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index 72924776c..e44b8b0ff 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -19,6 +19,13 @@ StartInterruptionFrame, StopInterruptionFrame, SystemFrame) +from pipecat.metrics.metrics import ( + LLMTokenUsage, + LLMUsageMetricsData, + MetricsData, + ProcessingMetricsData, + TTFBMetricsData, + TTSUsageMetricsData) from pipecat.utils.utils import obj_count, obj_id from loguru import logger @@ -31,11 +38,20 @@ class FrameDirection(Enum): class FrameProcessorMetrics: def __init__(self, name: str): - self._name = name + self._core_metrics_data = MetricsData(processor=name) self._start_ttfb_time = 0 self._start_processing_time = 0 self._should_report_ttfb = True + def _processor_name(self): + return self._core_metrics_data.processor + + def _model_name(self): + return self._core_metrics_data.model + + def set_core_metrics_data(self, data: MetricsData): + self._core_metrics_data = data + async def start_ttfb_metrics(self, report_only_initial_ttfb): if self._should_report_ttfb: self._start_ttfb_time = time.time() @@ -46,13 +62,13 @@ async def stop_ttfb_metrics(self): return None value = 
time.time() - self._start_ttfb_time - logger.debug(f"{self._name} TTFB: {value}") - ttfb = { - "processor": self._name, - "value": value - } + logger.debug(f"{self._processor_name()} TTFB: {value}") + ttfb = TTFBMetricsData( + processor=self._processor_name(), + value=value, + model=self._model_name()) self._start_ttfb_time = 0 - return MetricsFrame(ttfb=[ttfb]) + return MetricsFrame(data=[ttfb]) async def start_processing_metrics(self): self._start_processing_time = time.time() @@ -62,26 +78,28 @@ async def stop_processing_metrics(self): return None value = time.time() - self._start_processing_time - logger.debug(f"{self._name} processing time: {value}") - processing = { - "processor": self._name, - "value": value - } + logger.debug(f"{self._processor_name()} processing time: {value}") + processing = ProcessingMetricsData( + processor=self._processor_name(), value=value, model=self._model_name()) self._start_processing_time = 0 - return MetricsFrame(processing=[processing]) + return MetricsFrame(data=[processing]) - async def start_llm_usage_metrics(self, tokens: dict): + async def start_llm_usage_metrics(self, tokens: LLMTokenUsage): logger.debug( - f"{self._name} prompt tokens: {tokens['prompt_tokens']}, completion tokens: {tokens['completion_tokens']}") - return MetricsFrame(tokens=[tokens]) + f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}") + value = LLMUsageMetricsData( + processor=self._processor_name(), + model=self._model_name(), + value=tokens) + return MetricsFrame(data=[value]) async def start_tts_usage_metrics(self, text: str): - characters = { - "processor": self._name, - "value": len(text), - } - logger.debug(f"{self._name} usage characters: {characters['value']}") - return MetricsFrame(characters=[characters]) + characters = TTSUsageMetricsData( + processor=self._processor_name(), + model=self._model_name(), + value=len(text)) + logger.debug(f"{self._processor_name()} usage characters: {characters.value}") + return MetricsFrame(data=[characters]) class FrameProcessor: @@ -140,6 +158,9 @@ def report_only_initial_ttfb(self): def can_generate_metrics(self) -> bool: return False + def set_core_metrics_data(self, data: MetricsData): + self._metrics.set_core_metrics_data(data) + async def start_ttfb_metrics(self): if self.can_generate_metrics() and self.metrics_enabled: await self._metrics.start_ttfb_metrics(self._report_only_initial_ttfb) @@ -160,7 +181,7 @@ async def stop_processing_metrics(self): if frame: await self.push_frame(frame) - async def start_llm_usage_metrics(self, tokens: dict): + async def start_llm_usage_metrics(self, tokens: LLMTokenUsage): if self.can_generate_metrics() and self.usage_metrics_enabled: frame = await self._metrics.start_llm_usage_metrics(tokens) if frame: diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index d91782e42..0207b0511 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -32,6 +32,7 @@ UserImageRequestFrame, VisionImageRawFrame ) +from pipecat.metrics.metrics import MetricsData from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transcriptions.language import Language from pipecat.utils.audio import calculate_audio_volume @@ -46,6 +47,15 @@ class AIService(FrameProcessor): def __init__(self, **kwargs): super().__init__(**kwargs) + self._model_name: str = "" + + @property + def model_name(self) -> str: + return self._model_name + + def set_model_name(self, 
model: str): + self._model_name = model + self.set_core_metrics_data(MetricsData(processor=self.name, model=self._model_name)) async def start(self, frame: StartFrame): pass @@ -158,7 +168,7 @@ def sample_rate(self) -> int: @abstractmethod async def set_model(self, model: str): - pass + self.set_model_name(model) @abstractmethod async def set_voice(self, voice: str): @@ -367,7 +377,7 @@ def __init__(self, **kwargs): @abstractmethod async def set_model(self, model: str): - pass + self.set_model_name(model) @abstractmethod async def set_language(self, language: Language): diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 329959b1f..7935691ce 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -29,6 +29,7 @@ FunctionCallInProgressFrame, StartInterruptionFrame ) +from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import LLMService from pipecat.processors.aggregators.openai_llm_context import ( @@ -84,7 +85,7 @@ def __init__( **kwargs): super().__init__(**kwargs) self._client = AsyncAnthropic(api_key=api_key) - self._model = model + self.set_model_name(model) self._max_tokens = max_tokens self._enable_prompt_caching_beta = enable_prompt_caching_beta @@ -137,7 +138,7 @@ async def _process_context(self, context: OpenAILLMContext): tools=context.tools or [], system=context.system, messages=messages, - model=self._model, + model=self.model_name, max_tokens=self._max_tokens, stream=True) @@ -231,7 +232,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = AnthropicLLMContext.from_image_frame(frame) elif isinstance(frame, LLMModelUpdateFrame): logger.debug(f"Switching LLM model to: [{frame.model}]") - self._model = frame.model + self.set_model_name(frame.model) elif isinstance(frame, LLMEnablePromptCachingFrame): logger.debug(f"Setting enable prompt caching to: [{frame.enable}]") self._enable_prompt_caching_beta = frame.enable @@ -251,15 +252,13 @@ async def _report_usage_metrics( cache_creation_input_tokens: int, cache_read_input_tokens: int): if prompt_tokens or completion_tokens or cache_creation_input_tokens or cache_read_input_tokens: - tokens = { - "processor": self.name, - "model": self._model, - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "cache_creation_input_tokens": cache_creation_input_tokens, - "cache_read_input_tokens": cache_read_input_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + tokens = LLMTokenUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + total_tokens=prompt_tokens + completion_tokens + ) await self.start_llm_usage_metrics(tokens) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 90674fcc4..daac57119 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -22,6 +22,8 @@ TTSStoppedFrame, TranscriptionFrame, URLImageRawFrame) +from pipecat.metrics.metrics import TTSUsageMetricsData +from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import STTService, TTSService, ImageGenService from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 @@ -190,7 +192,7 @@ def __init__( self._api_key = api_key self._azure_endpoint = endpoint self._api_version = 
api_version - self._model = model + self.set_model_name(model) self._image_size = image_size self._aiohttp_session = aiohttp_session diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 078926235..ab6026052 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -89,7 +89,7 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = model_id + self.set_model_name(model_id) self._output_format = { "container": "raw", "encoding": encoding, @@ -105,8 +105,8 @@ def can_generate_metrics(self) -> bool: return True async def set_model(self, model: str): + await super().set_model(model) logger.debug(f"Switching TTS model to: [{model}]") - self._model_id = model async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") @@ -155,6 +155,11 @@ async def _disconnect(self): except Exception as e: logger.error(f"{self} error closing websocket: {e}") + def _get_websocket(self): + if self._websocket: + return self._websocket + raise Exception("Websocket not connected") + async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): await super()._handle_interruption(frame, direction) await self.stop_all_metrics() @@ -169,7 +174,7 @@ async def flush_audio(self): "transcript": "", "continue": False, "context_id": self._context_id, - "model_id": self._model_id, + "model_id": self.model_name, "voice": { "mode": "id", "id": self._voice_id @@ -182,7 +187,7 @@ async def flush_audio(self): async def _receive_task_handler(self): try: - async for message in self._websocket: + async for message in self._get_websocket(): msg = json.loads(message) if not msg or msg["context_id"] != self._context_id: continue @@ -235,7 +240,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: "transcript": text + " ", "continue": True, "context_id": self._context_id, - "model_id": self._model_id, + "model_id": self.model_name, "voice": { "mode": "id", "id": self._voice_id @@ -245,7 +250,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: "add_timestamps": True, } try: - await self._websocket.send(json.dumps(msg)) + await self._get_websocket().send(json.dumps(msg)) await self.start_tts_usage_metrics(text) except Exception as e: logger.error(f"{self} error sending message: {e}") diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 708c3c511..5aebdfbb3 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -135,6 +135,7 @@ def __init__(self, self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message) async def set_model(self, model: str): + await super().set_model(model) logger.debug(f"Switching STT model to: [{model}]") self._live_options.model = model await self._disconnect() diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 081a6bf5d..40aa25865 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -107,7 +107,7 @@ def __init__( self._api_key = api_key self._voice_id = voice_id - self._model = model + self.set_model_name(model) self._url = url self._params = params @@ -122,8 +122,8 @@ def can_generate_metrics(self) -> bool: return True async def set_model(self, model: str): + await super().set_model(model) logger.debug(f"Switching TTS model to: [{model}]") - self._model = model await self._disconnect() await self._connect() @@ -160,7 +160,7 @@ async def 
push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect async def _connect(self): try: voice_id = self._voice_id - model = self._model + model = self.model_name output_format = self._params.output_format url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}" self._websocket = await websockets.connect(url) diff --git a/src/pipecat/services/fal.py b/src/pipecat/services/fal.py index 672135d02..58768180f 100644 --- a/src/pipecat/services/fal.py +++ b/src/pipecat/services/fal.py @@ -46,7 +46,7 @@ def __init__( **kwargs ): super().__init__(**kwargs) - self._model = model + self.set_model_name(model) self._params = params self._aiohttp_session = aiohttp_session if key: @@ -56,7 +56,7 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating image from prompt: {prompt}") response = await fal_client.run_async( - self._model, + self.model_name, arguments={"prompt": prompt, **self._params.model_dump(exclude_none=True)} ) diff --git a/src/pipecat/services/fireworks.py b/src/pipecat/services/fireworks.py index 7fa4d64e8..87fddd838 100644 --- a/src/pipecat/services/fireworks.py +++ b/src/pipecat/services/fireworks.py @@ -22,4 +22,4 @@ def __init__(self, *, model: str = "accounts/fireworks/models/firefunction-v1", base_url: str = "https://api.fireworks.ai/inference/v1"): - super().__init__(model, base_url) + super().__init__(model=model, base_url=base_url) diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 7f20f1b8f..b72169b70 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -50,6 +50,7 @@ def can_generate_metrics(self) -> bool: return True def _create_client(self, model: str): + self.set_model_name(model) self._client = gai.GenerativeModel(model) def _get_messages_from_openai_context( diff --git a/src/pipecat/services/moondream.py b/src/pipecat/services/moondream.py index 3441aeeb9..b6391cc93 100644 --- a/src/pipecat/services/moondream.py +++ b/src/pipecat/services/moondream.py @@ -54,6 +54,8 @@ def __init__( ): super().__init__(**kwargs) + self.set_model_name(model) + if not use_cpu: device, dtype = detect_device() else: @@ -73,7 +75,7 @@ def __init__( async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]: if not self._model: - logger.error(f"{self} error: Moondream model not available") + logger.error(f"{self} error: Moondream model not available ({self.model_name})") yield ErrorFrame("Moondream model not available") return diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 7483e2eb5..6fe710b5e 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -33,6 +33,7 @@ FunctionCallInProgressFrame, StartInterruptionFrame ) +from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator, LLMAssistantContextAggregator from pipecat.processors.aggregators.openai_llm_context import ( @@ -83,7 +84,7 @@ class BaseOpenAILLMService(LLMService): def __init__(self, *, model: str, api_key=None, base_url=None, **kwargs): super().__init__(**kwargs) - self._model: str = model + self.set_model_name(model) self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs) def create_client(self, api_key=None, base_url=None, **kwargs): @@ -104,7 +105,7 @@ async def get_chat_completions( context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]) -> 
AsyncStream[ChatCompletionChunk]: chunks = await self._client.chat.completions.create( - model=self._model, + model=self.model_name, stream=True, messages=messages, tools=context.tools, @@ -148,13 +149,11 @@ async def _process_context(self, context: OpenAILLMContext): async for chunk in chunk_stream: if chunk.usage: - tokens = { - "processor": self.name, - "model": self._model, - "prompt_tokens": chunk.usage.prompt_tokens, - "completion_tokens": chunk.usage.completion_tokens, - "total_tokens": chunk.usage.total_tokens - } + tokens = LLMTokenUsage( + prompt_tokens=chunk.usage.prompt_tokens, + completion_tokens=chunk.usage.completion_tokens, + total_tokens=chunk.usage.total_tokens + ) await self.start_llm_usage_metrics(tokens) if len(chunk.choices) == 0: @@ -223,7 +222,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_image_frame(frame) elif isinstance(frame, LLMModelUpdateFrame): logger.debug(f"Switching LLM model to: [{frame.model}]") - self._model = frame.model + self.set_model_name(frame.model) else: await self.push_frame(frame, direction) @@ -273,7 +272,7 @@ def __init__( model: str = "dall-e-3", ): super().__init__() - self._model = model + self.set_model_name(model) self._image_size = image_size self._client = AsyncOpenAI(api_key=api_key) self._aiohttp_session = aiohttp_session @@ -283,7 +282,7 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: image = await self._client.images.generate( prompt=prompt, - model=self._model, + model=self.model_name, n=1, size=self._image_size ) @@ -325,7 +324,7 @@ def __init__( super().__init__(sample_rate=sample_rate, **kwargs) self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy") - self._model = model + self.set_model_name(model) self._sample_rate = sample_rate self._client = AsyncOpenAI(api_key=api_key) @@ -348,7 +347,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: async with self._client.audio.speech.with_streaming_response.create( input=text, - model=self._model, + model=self.model_name, voice=self._voice, response_format="pcm", ) as r: diff --git a/src/pipecat/services/openpipe.py b/src/pipecat/services/openpipe.py index ada7824fb..e4e14dc15 100644 --- a/src/pipecat/services/openpipe.py +++ b/src/pipecat/services/openpipe.py @@ -60,7 +60,7 @@ async def get_chat_completions( context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]: chunks = await self._client.chat.completions.create( - model=self._model, + model=self.model_name, stream=True, messages=messages, openpipe={ diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 49759cb01..004236ac8 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -18,9 +18,7 @@ Frame, LLMModelUpdateFrame, TextFrame, - VisionImageRawFrame, UserImageRequestFrame, - UserImageRawFrame, LLMMessagesFrame, LLMFullResponseStartFrame, LLMFullResponseEndFrame, @@ -28,6 +26,7 @@ FunctionCallInProgressFrame, StartInterruptionFrame ) +from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import LLMService from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame @@ -69,7 +68,7 @@ def __init__( **kwargs): super().__init__(**kwargs) self._client = AsyncTogether(api_key=api_key) - self._model = model + self.set_model_name(model) self._max_tokens = max_tokens def 
can_generate_metrics(self) -> bool: @@ -95,7 +94,7 @@ async def _process_context(self, context: OpenAILLMContext): stream = await self._client.chat.completions.create( messages=context.messages, - model=self._model, + model=self.model_name, max_tokens=self._max_tokens, stream=True, ) @@ -108,13 +107,11 @@ async def _process_context(self, context: OpenAILLMContext): async for chunk in stream: # logger.debug(f"Together LLM event: {chunk}") if chunk.usage: - tokens = { - "processor": self.name, - "model": self._model, - "prompt_tokens": chunk.usage.prompt_tokens, - "completion_tokens": chunk.usage.completion_tokens, - "total_tokens": chunk.usage.total_tokens - } + tokens = LLMTokenUsage( + prompt_tokens=chunk.usage.prompt_tokens, + completion_tokens=chunk.usage.completion_tokens, + total_tokens=chunk.usage.total_tokens + ) await self.start_llm_usage_metrics(tokens) if len(chunk.choices) == 0: @@ -156,7 +153,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = TogetherLLMContext.from_messages(frame.messages) elif isinstance(frame, LLMModelUpdateFrame): logger.debug(f"Switching LLM model to: [{frame.model}]") - self._model = frame.model + self.set_model_name(frame.model) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/whisper.py b/src/pipecat/services/whisper.py index 04f357a94..9f54f9ca0 100644 --- a/src/pipecat/services/whisper.py +++ b/src/pipecat/services/whisper.py @@ -52,7 +52,7 @@ def __init__(self, super().__init__(**kwargs) self._device: str = device self._compute_type = compute_type - self._model_name: str | Model = model + self.set_model_name(model if isinstance(model, str) else model.value) self._no_speech_prob = no_speech_prob self._model: WhisperModel | None = None self._load() @@ -65,7 +65,7 @@ def _load(self): this model is being run, it will take time to download.""" logger.debug("Loading Whisper model...") self._model = WhisperModel( - self._model_name.value if isinstance(self._model_name, Enum) else self._model_name, + self.model_name, device=self._device, compute_type=self._compute_type) logger.debug("Loaded Whisper model") diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 2a45adf36..e28fe6083 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -35,6 +35,7 @@ TransportMessageFrame, UserImageRawFrame, UserImageRequestFrame) +from pipecat.metrics.metrics import LLMUsageMetricsData, ProcessingMetricsData, TTFBMetricsData, TTSUsageMetricsData from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transcriptions.language import Language from pipecat.transports.base_input import BaseInputTransport @@ -731,14 +732,23 @@ async def send_message(self, frame: TransportMessageFrame): async def send_metrics(self, frame: MetricsFrame): metrics = {} - if frame.ttfb: - metrics["ttfb"] = frame.ttfb - if frame.processing: - metrics["processing"] = frame.processing - if frame.tokens: - metrics["tokens"] = frame.tokens - if frame.characters: - metrics["characters"] = frame.characters + for d in frame.data: + if isinstance(d, TTFBMetricsData): + if "ttfb" not in metrics: + metrics["ttfb"] = [] + metrics["ttfb"].append(d.model_dump()) + elif isinstance(d, ProcessingMetricsData): + if "processing" not in metrics: + metrics["processing"] = [] + metrics["processing"].append(d.model_dump()) + elif isinstance(d, LLMUsageMetricsData): + if "tokens" not in metrics: + metrics["tokens"] = [] + 
metrics["tokens"].append(d.value.model_dump(exclude_none=True)) + elif isinstance(d, TTSUsageMetricsData): + if "characters" not in metrics: + metrics["characters"] = [] + metrics["characters"].append(d.model_dump()) message = DailyTransportMessageFrame(message={ "type": "pipecat-metrics", From 7e39d9ad3d8ee7b38cc32911cb8e97b64a42a8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 19 Sep 2024 19:31:56 -0700 Subject: [PATCH 04/10] introduce input/output audio and image frames We now distinguish between input and output audio and image frames. We introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` and `OutputImageRawFrame` (and other subclasses of those). The input frames usually come from an input transport and are meant to be processed inside the pipeline to generate new frames. However, the input frames will not be sent through an output transport. The output frames can also be processed by any frame processor in the pipeline and they are allowed to be sent by the output transport. --- CHANGELOG.md | 9 ++ examples/dialin-chatbot/requirements.txt | 2 +- .../05a-local-sync-speech-and-image.py | 12 +- examples/foundational/06a-image-sync.py | 13 +- examples/foundational/09-mirror.py | 25 +++- examples/foundational/09a-local-mirror.py | 23 +++- examples/foundational/11-sound-effects.py | 6 +- examples/moondream-chatbot/bot.py | 11 +- examples/moondream-chatbot/requirements.txt | 2 +- examples/patient-intake/bot.py | 7 +- examples/patient-intake/requirements.txt | 2 +- examples/simple-chatbot/bot.py | 12 +- examples/simple-chatbot/requirements.txt | 2 +- .../storytelling-chatbot/requirements.txt | 2 +- .../storytelling-chatbot/src/utils/helpers.py | 11 +- examples/twilio-chatbot/README.md | 2 +- examples/twilio-chatbot/bot.py | 118 +++++++++--------- examples/twilio-chatbot/requirements.txt | 2 +- examples/websocket-server/bot.py | 106 ++++++++-------- examples/websocket-server/frames.proto | 1 + examples/websocket-server/requirements.txt | 2 +- src/pipecat/frames/frames.proto | 1 + src/pipecat/frames/frames.py | 66 +++++++--- src/pipecat/frames/protobufs/frames_pb2.py | 12 +- .../processors/aggregators/llm_response.py | 6 +- .../aggregators/openai_llm_context.py | 6 +- .../aggregators/vision_image_frame.py | 12 +- .../processors/gstreamer/pipeline_source.py | 12 +- src/pipecat/serializers/livekit.py | 13 +- src/pipecat/serializers/protobuf.py | 24 ++-- src/pipecat/serializers/twilio.py | 9 +- src/pipecat/services/ai_services.py | 3 +- src/pipecat/services/azure.py | 4 +- src/pipecat/services/cartesia.py | 6 +- src/pipecat/services/deepgram.py | 5 +- src/pipecat/services/elevenlabs.py | 4 +- src/pipecat/services/lmnt.py | 4 +- src/pipecat/services/openai.py | 4 +- src/pipecat/services/playht.py | 8 +- src/pipecat/services/together.py | 12 +- src/pipecat/services/xtts.py | 6 +- src/pipecat/transports/base_input.py | 6 +- src/pipecat/transports/base_output.py | 39 +++--- src/pipecat/transports/local/audio.py | 8 +- src/pipecat/transports/local/tk.py | 13 +- .../transports/network/fastapi_websocket.py | 18 ++- .../transports/network/websocket_server.py | 9 +- src/pipecat/transports/services/daily.py | 20 +-- 48 files changed, 425 insertions(+), 275 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 985834aba..1a89a97b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- We now distinguish between input and output audio 
and image frames. We + introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` + and `OutputImageRawFrame` (and other subclasses of those). The input frames + usually come from an input transport and are meant to be processed inside the + pipeline to generate new frames. However, the input frames will not be sent + through an output transport. The output frames can also be processed by any + frame processor in the pipeline and they are allowed to be sent by the output + transport. + - `ParallelTask` has been renamed to `SyncParallelPipeline`. A `SyncParallelPipeline` is a frame processor that contains a list of different pipelines to be executed concurrently. The difference between a diff --git a/examples/dialin-chatbot/requirements.txt b/examples/dialin-chatbot/requirements.txt index e59a9c3d2..1e15004b1 100644 --- a/examples/dialin-chatbot/requirements.txt +++ b/examples/dialin-chatbot/requirements.txt @@ -1,4 +1,4 @@ -pipecat-ai[daily,openai,silero] +pipecat-ai[daily,elevenlabs,openai,silero] fastapi uvicorn python-dotenv diff --git a/examples/foundational/05a-local-sync-speech-and-image.py b/examples/foundational/05a-local-sync-speech-and-image.py index 66a6e7f57..d9a0e792e 100644 --- a/examples/foundational/05a-local-sync-speech-and-image.py +++ b/examples/foundational/05a-local-sync-speech-and-image.py @@ -11,7 +11,13 @@ import tkinter as tk -from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame +from pipecat.frames.frames import ( + Frame, + OutputAudioRawFrame, + TTSAudioRawFrame, + URLImageRawFrame, + LLMMessagesFrame, + TextFrame) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline @@ -65,9 +71,9 @@ def __init__(self): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) - if isinstance(frame, AudioRawFrame): + if isinstance(frame, TTSAudioRawFrame): self.audio.extend(frame.audio) - self.frame = AudioRawFrame( + self.frame = OutputAudioRawFrame( bytes(self.audio), frame.sample_rate, frame.num_channels) class ImageGrabber(FrameProcessor): diff --git a/examples/foundational/06a-image-sync.py b/examples/foundational/06a-image-sync.py index 6b3e58cf7..db48b709e 100644 --- a/examples/foundational/06a-image-sync.py +++ b/examples/foundational/06a-image-sync.py @@ -11,7 +11,7 @@ from PIL import Image -from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame +from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask @@ -52,9 +52,16 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM: - await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format)) + await self.push_frame(OutputImageRawFrame( + image=self._speaking_image_bytes, + size=(1024, 1024), + format=self._speaking_image_format) + ) await self.push_frame(frame) - await self.push_frame(ImageRawFrame(image=self._waiting_image_bytes, size=(1024, 1024), format=self._waiting_image_format)) + await self.push_frame(OutputImageRawFrame( + image=self._waiting_image_bytes, + size=(1024, 1024), + 
format=self._waiting_image_format)) else: await self.push_frame(frame) diff --git a/examples/foundational/09-mirror.py b/examples/foundational/09-mirror.py index 8f5f1073b..bb6253deb 100644 --- a/examples/foundational/09-mirror.py +++ b/examples/foundational/09-mirror.py @@ -8,9 +8,11 @@ import asyncio import sys +from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transports.services.daily import DailyTransport, DailyParams from runner import configure @@ -24,6 +26,27 @@ logger.add(sys.stderr, level="DEBUG") +class MirrorProcessor(FrameProcessor): + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, InputAudioRawFrame): + await self.push_frame(OutputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels) + ) + elif isinstance(frame, InputImageRawFrame): + await self.push_frame(OutputImageRawFrame( + image=frame.image, + size=frame.size, + format=frame.format) + ) + else: + await self.push_frame(frame, direction) + + async def main(): async with aiohttp.ClientSession() as session: (room_url, token) = await configure(session) @@ -44,7 +67,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_video(participant["id"]) - pipeline = Pipeline([transport.input(), transport.output()]) + pipeline = Pipeline([transport.input(), MirrorProcessor(), transport.output()]) runner = PipelineRunner() diff --git a/examples/foundational/09a-local-mirror.py b/examples/foundational/09a-local-mirror.py index d657a3631..afc77470d 100644 --- a/examples/foundational/09a-local-mirror.py +++ b/examples/foundational/09a-local-mirror.py @@ -10,9 +10,11 @@ import tkinter as tk +from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transports.base_transport import TransportParams from pipecat.transports.local.tk import TkLocalTransport from pipecat.transports.services.daily import DailyParams, DailyTransport @@ -27,6 +29,25 @@ logger.remove(0) logger.add(sys.stderr, level="DEBUG") +class MirrorProcessor(FrameProcessor): + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, InputAudioRawFrame): + await self.push_frame(OutputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels) + ) + elif isinstance(frame, InputImageRawFrame): + await self.push_frame(OutputImageRawFrame( + image=frame.image, + size=frame.size, + format=frame.format) + ) + else: + await self.push_frame(frame, direction) async def main(): async with aiohttp.ClientSession() as session: @@ -52,7 +73,7 @@ async def main(): async def on_first_participant_joined(transport, participant): transport.capture_participant_video(participant["id"]) - pipeline = Pipeline([daily_transport.input(), tk_transport.output()]) + 
pipeline = Pipeline([daily_transport.input(), MirrorProcessor(), tk_transport.output()]) task = PipelineTask(pipeline) diff --git a/examples/foundational/11-sound-effects.py b/examples/foundational/11-sound-effects.py index 9dc4dc99b..21b03bedf 100644 --- a/examples/foundational/11-sound-effects.py +++ b/examples/foundational/11-sound-effects.py @@ -12,9 +12,9 @@ from pipecat.frames.frames import ( Frame, - AudioRawFrame, LLMFullResponseEndFrame, LLMMessagesFrame, + OutputAudioRawFrame, ) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -53,8 +53,8 @@ filename = os.path.splitext(os.path.basename(full_path))[0] # Open the image and convert it to bytes with wave.open(full_path) as audio_file: - sounds[file] = AudioRawFrame(audio_file.readframes(-1), - audio_file.getframerate(), audio_file.getnchannels()) + sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1), + audio_file.getframerate(), audio_file.getnchannels()) class OutboundSoundEffectWrapper(FrameProcessor): diff --git a/examples/moondream-chatbot/bot.py b/examples/moondream-chatbot/bot.py index 32bbf7e6b..d14a5f016 100644 --- a/examples/moondream-chatbot/bot.py +++ b/examples/moondream-chatbot/bot.py @@ -13,10 +13,11 @@ from pipecat.frames.frames import ( ImageRawFrame, + OutputImageRawFrame, SpriteFrame, Frame, LLMMessagesFrame, - AudioRawFrame, + TTSAudioRawFrame, TTSStoppedFrame, TextFrame, UserImageRawFrame, @@ -59,7 +60,11 @@ # Get the filename without the extension to use as the dictionary key # Open the image and convert it to bytes with Image.open(full_path) as img: - sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)) + sprites.append(OutputImageRawFrame( + image=img.tobytes(), + size=img.size, + format=img.format) + ) flipped = sprites[::-1] sprites.extend(flipped) @@ -82,7 +87,7 @@ def __init__(self): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) - if isinstance(frame, AudioRawFrame): + if isinstance(frame, TTSAudioRawFrame): if not self._is_talking: await self.push_frame(talking_frame) self._is_talking = True diff --git a/examples/moondream-chatbot/requirements.txt b/examples/moondream-chatbot/requirements.txt index 11132e136..08fd27cb7 100644 --- a/examples/moondream-chatbot/requirements.txt +++ b/examples/moondream-chatbot/requirements.txt @@ -1,4 +1,4 @@ python-dotenv fastapi[all] uvicorn -pipecat-ai[daily,moondream,openai,silero] +pipecat-ai[daily,cartesia,moondream,openai,silero] diff --git a/examples/patient-intake/bot.py b/examples/patient-intake/bot.py index 7dc404c52..33ca9e26d 100644 --- a/examples/patient-intake/bot.py +++ b/examples/patient-intake/bot.py @@ -10,7 +10,7 @@ import sys import wave -from pipecat.frames.frames import AudioRawFrame +from pipecat.frames.frames import OutputAudioRawFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -49,8 +49,9 @@ filename = os.path.splitext(os.path.basename(full_path))[0] # Open the sound and convert it to bytes with wave.open(full_path) as audio_file: - sounds[file] = AudioRawFrame(audio_file.readframes(-1), - audio_file.getframerate(), audio_file.getnchannels()) + sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1), + audio_file.getframerate(), + audio_file.getnchannels()) class IntakeProcessor: diff --git a/examples/patient-intake/requirements.txt 
b/examples/patient-intake/requirements.txt index a7a8729df..e8bfcd8e4 100644 --- a/examples/patient-intake/requirements.txt +++ b/examples/patient-intake/requirements.txt @@ -1,4 +1,4 @@ python-dotenv fastapi[all] uvicorn -pipecat-ai[daily,openai,silero] +pipecat-ai[daily,cartesia,openai,silero] diff --git a/examples/simple-chatbot/bot.py b/examples/simple-chatbot/bot.py index 1664e47fb..f179dfeb5 100644 --- a/examples/simple-chatbot/bot.py +++ b/examples/simple-chatbot/bot.py @@ -16,11 +16,11 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator from pipecat.frames.frames import ( - AudioRawFrame, - ImageRawFrame, + OutputImageRawFrame, SpriteFrame, Frame, LLMMessagesFrame, + TTSAudioRawFrame, TTSStoppedFrame ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -49,7 +49,11 @@ # Get the filename without the extension to use as the dictionary key # Open the image and convert it to bytes with Image.open(full_path) as img: - sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)) + sprites.append(OutputImageRawFrame( + image=img.tobytes(), + size=img.size, + format=img.format) + ) flipped = sprites[::-1] sprites.extend(flipped) @@ -72,7 +76,7 @@ def __init__(self): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) - if isinstance(frame, AudioRawFrame): + if isinstance(frame, TTSAudioRawFrame): if not self._is_talking: await self.push_frame(talking_frame) self._is_talking = True diff --git a/examples/simple-chatbot/requirements.txt b/examples/simple-chatbot/requirements.txt index a7a8729df..a4e6aa1db 100644 --- a/examples/simple-chatbot/requirements.txt +++ b/examples/simple-chatbot/requirements.txt @@ -1,4 +1,4 @@ python-dotenv fastapi[all] uvicorn -pipecat-ai[daily,openai,silero] +pipecat-ai[daily,elevenlabs,openai,silero] diff --git a/examples/storytelling-chatbot/requirements.txt b/examples/storytelling-chatbot/requirements.txt index 663f78a76..0cebe6edb 100644 --- a/examples/storytelling-chatbot/requirements.txt +++ b/examples/storytelling-chatbot/requirements.txt @@ -2,4 +2,4 @@ async_timeout fastapi uvicorn python-dotenv -pipecat-ai[daily,openai,fal] +pipecat-ai[daily,elevenlabs,openai,fal] diff --git a/examples/storytelling-chatbot/src/utils/helpers.py b/examples/storytelling-chatbot/src/utils/helpers.py index 2c576fdff..743a04c97 100644 --- a/examples/storytelling-chatbot/src/utils/helpers.py +++ b/examples/storytelling-chatbot/src/utils/helpers.py @@ -2,7 +2,7 @@ import wave from PIL import Image -from pipecat.frames.frames import AudioRawFrame, ImageRawFrame +from pipecat.frames.frames import OutputAudioRawFrame, OutputImageRawFrame script_dir = os.path.dirname(__file__) @@ -16,7 +16,8 @@ def load_images(image_files): filename = os.path.splitext(os.path.basename(full_path))[0] # Open the image and convert it to bytes with Image.open(full_path) as img: - images[filename] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format) + images[filename] = OutputImageRawFrame( + image=img.tobytes(), size=img.size, format=img.format) return images @@ -30,8 +31,8 @@ def load_sounds(sound_files): filename = os.path.splitext(os.path.basename(full_path))[0] # Open the sound and convert it to bytes with wave.open(full_path) as audio_file: - sounds[filename] = AudioRawFrame(audio=audio_file.readframes(-1), - 
sample_rate=audio_file.getframerate(), - num_channels=audio_file.getnchannels()) + sounds[filename] = OutputAudioRawFrame(audio=audio_file.readframes(-1), + sample_rate=audio_file.getframerate(), + num_channels=audio_file.getnchannels()) return sounds diff --git a/examples/twilio-chatbot/README.md b/examples/twilio-chatbot/README.md index 5d5d2385a..fdea359f9 100644 --- a/examples/twilio-chatbot/README.md +++ b/examples/twilio-chatbot/README.md @@ -55,7 +55,7 @@ This project is a FastAPI-based chatbot that integrates with Twilio to handle We 2. **Update the Twilio Webhook**: Copy the ngrok URL and update your Twilio phone number webhook URL to `http:///start_call`. -3. **Update the streams.xml**: +3. **Update streams.xml**: Copy the ngrok URL and update templates/streams.xml with `wss:///ws`. ## Running the Application diff --git a/examples/twilio-chatbot/bot.py b/examples/twilio-chatbot/bot.py index b7376e084..5b83139f9 100644 --- a/examples/twilio-chatbot/bot.py +++ b/examples/twilio-chatbot/bot.py @@ -1,4 +1,3 @@ -import aiohttp import os import sys @@ -27,63 +26,62 @@ async def run_bot(websocket_client, stream_sid): - async with aiohttp.ClientSession() as session: - transport = FastAPIWebsocketTransport( - websocket=websocket_client, - params=FastAPIWebsocketParams( - audio_out_enabled=True, - add_wav_header=False, - vad_enabled=True, - vad_analyzer=SileroVADAnalyzer(), - vad_audio_passthrough=True, - serializer=TwilioFrameSerializer(stream_sid) - ) + transport = FastAPIWebsocketTransport( + websocket=websocket_client, + params=FastAPIWebsocketParams( + audio_out_enabled=True, + add_wav_header=False, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True, + serializer=TwilioFrameSerializer(stream_sid) ) - - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") - - stt = DeepgramSTTService(api_key=os.getenv('DEEPGRAM_API_KEY')) - - tts = CartesiaTTSService( - api_key=os.getenv("CARTESIA_API_KEY"), - voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady - ) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in an audio call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", - }, - ] - - tma_in = LLMUserResponseAggregator(messages) - tma_out = LLMAssistantResponseAggregator(messages) - - pipeline = Pipeline([ - transport.input(), # Websocket input from client - stt, # Speech-To-Text - tma_in, # User responses - llm, # LLM - tts, # Text-To-Speech - transport.output(), # Websocket output to client - tma_out # LLM responses - ]) - - task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True)) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - # Kick off the conversation. 
- messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([LLMMessagesFrame(messages)]) - - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - await task.queue_frames([EndFrame()]) - - runner = PipelineRunner(handle_sigint=False) - - await runner.run(task) + ) + + llm = OpenAILLMService( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o") + + stt = DeepgramSTTService(api_key=os.getenv('DEEPGRAM_API_KEY')) + + tts = CartesiaTTSService( + api_key=os.getenv("CARTESIA_API_KEY"), + voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in an audio call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline([ + transport.input(), # Websocket input from client + stt, # Speech-To-Text + tma_in, # User responses + llm, # LLM + tts, # Text-To-Speech + transport.output(), # Websocket output to client + tma_out # LLM responses + ]) + + task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True)) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + # Kick off the conversation. + messages.append( + {"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMMessagesFrame(messages)]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + await task.queue_frames([EndFrame()]) + + runner = PipelineRunner(handle_sigint=False) + + await runner.run(task) diff --git a/examples/twilio-chatbot/requirements.txt b/examples/twilio-chatbot/requirements.txt index f0456fcd5..eefaca888 100644 --- a/examples/twilio-chatbot/requirements.txt +++ b/examples/twilio-chatbot/requirements.txt @@ -1,4 +1,4 @@ -pipecat-ai[daily,openai,silero,deepgram] +pipecat-ai[daily,cartesia,openai,silero,deepgram] fastapi uvicorn python-dotenv diff --git a/examples/websocket-server/bot.py b/examples/websocket-server/bot.py index 29a99614f..61d285fa8 100644 --- a/examples/websocket-server/bot.py +++ b/examples/websocket-server/bot.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import os import sys @@ -33,60 +32,59 @@ async def main(): - async with aiohttp.ClientSession() as session: - transport = WebsocketServerTransport( - params=WebsocketServerParams( - audio_out_enabled=True, - add_wav_header=True, - vad_enabled=True, - vad_analyzer=SileroVADAnalyzer(), - vad_audio_passthrough=True - ) + transport = WebsocketServerTransport( + params=WebsocketServerParams( + audio_out_enabled=True, + add_wav_header=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True ) - - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o") - - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - - tts = CartesiaTTSService( - api_key=os.getenv("CARTESIA_API_KEY"), - voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady - ) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. 
Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", - }, - ] - - tma_in = LLMUserResponseAggregator(messages) - tma_out = LLMAssistantResponseAggregator(messages) - - pipeline = Pipeline([ - transport.input(), # Websocket input from client - stt, # Speech-To-Text - tma_in, # User responses - llm, # LLM - tts, # Text-To-Speech - transport.output(), # Websocket output to client - tma_out # LLM responses - ]) - - task = PipelineTask(pipeline) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - # Kick off the conversation. - messages.append( - {"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([LLMMessagesFrame(messages)]) - - runner = PipelineRunner() - - await runner.run(task) + ) + + llm = OpenAILLMService( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o") + + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + + tts = CartesiaTTSService( + api_key=os.getenv("CARTESIA_API_KEY"), + voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline([ + transport.input(), # Websocket input from client + stt, # Speech-To-Text + tma_in, # User responses + llm, # LLM + tts, # Text-To-Speech + transport.output(), # Websocket output to client + tma_out # LLM responses + ]) + + task = PipelineTask(pipeline) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + # Kick off the conversation. 
+ messages.append( + {"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMMessagesFrame(messages)]) + + runner = PipelineRunner() + + await runner.run(task) if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/websocket-server/frames.proto b/examples/websocket-server/frames.proto index 5c5d81d4d..4c58d2a34 100644 --- a/examples/websocket-server/frames.proto +++ b/examples/websocket-server/frames.proto @@ -24,6 +24,7 @@ message AudioRawFrame { bytes audio = 3; uint32 sample_rate = 4; uint32 num_channels = 5; + optional uint64 pts = 6; } message TranscriptionFrame { diff --git a/examples/websocket-server/requirements.txt b/examples/websocket-server/requirements.txt index 77e5b9e91..0815c6b8a 100644 --- a/examples/websocket-server/requirements.txt +++ b/examples/websocket-server/requirements.txt @@ -1,2 +1,2 @@ python-dotenv -pipecat-ai[openai,silero,websocket,whisper] +pipecat-ai[cartesia,openai,silero,websocket,whisper] diff --git a/src/pipecat/frames/frames.proto b/src/pipecat/frames/frames.proto index 5c5d81d4d..4c58d2a34 100644 --- a/src/pipecat/frames/frames.proto +++ b/src/pipecat/frames/frames.proto @@ -24,6 +24,7 @@ message AudioRawFrame { bytes audio = 3; uint32 sample_rate = 4; uint32 num_channels = 5; + optional uint64 pts = 6; } message TranscriptionFrame { diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 4d207fecd..d45487150 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -41,10 +41,7 @@ class DataFrame(Frame): @dataclass class AudioRawFrame(DataFrame): - """A chunk of audio. Will be played by the transport if the transport's - microphone has been enabled. - - """ + """A chunk of audio.""" audio: bytes sample_rate: int num_channels: int @@ -58,6 +55,31 @@ def __str__(self): return f"{self.name}(pts: {pts}, size: {len(self.audio)}, frames: {self.num_frames}, sample_rate: {self.sample_rate}, channels: {self.num_channels})" +@dataclass +class InputAudioRawFrame(AudioRawFrame): + """A chunk of audio usually coming from an input transport. + + """ + pass + + +@dataclass +class OutputAudioRawFrame(AudioRawFrame): + """A chunk of audio. Will be played by the output transport if the + transport's microphone has been enabled. + + """ + pass + + +@dataclass +class TTSAudioRawFrame(OutputAudioRawFrame): + """A chunk of output audio generated by a TTS service. + + """ + pass + + @dataclass class ImageRawFrame(DataFrame): """An image. Will be shown by the transport if the transport's camera is @@ -74,20 +96,30 @@ def __str__(self): @dataclass -class URLImageRawFrame(ImageRawFrame): - """An image with an associated URL. Will be shown by the transport if the +class InputImageRawFrame(ImageRawFrame): + pass + + +@dataclass +class OutputImageRawFrame(ImageRawFrame): + pass + + +@dataclass +class UserImageRawFrame(InputImageRawFrame): + """An image associated to a user. Will be shown by the transport if the transport's camera is enabled. """ - url: str | None + user_id: str def __str__(self): pts = format_pts(self.pts) - return f"{self.name}(pts: {pts}, url: {self.url}, size: {self.size}, format: {self.format})" + return f"{self.name}(pts: {pts}, user: {self.user_id}, size: {self.size}, format: {self.format})" @dataclass -class VisionImageRawFrame(ImageRawFrame): +class VisionImageRawFrame(InputImageRawFrame): """An image with an associated text to ask for a description of it. Will be shown by the transport if the transport's camera is enabled. 
@@ -100,16 +132,16 @@ def __str__(self): @dataclass -class UserImageRawFrame(ImageRawFrame): - """An image associated to a user. Will be shown by the transport if the +class URLImageRawFrame(OutputImageRawFrame): + """An image with an associated URL. Will be shown by the transport if the transport's camera is enabled. """ - user_id: str + url: str | None def __str__(self): pts = format_pts(self.pts) - return f"{self.name}(pts: {pts}, user: {self.user_id}, size: {self.size}, format: {self.format})" + return f"{self.name}(pts: {pts}, url: {self.url}, size: {self.size}, format: {self.format})" @dataclass @@ -420,10 +452,10 @@ class BotSpeakingFrame(ControlFrame): @dataclass class TTSStartedFrame(ControlFrame): """Used to indicate the beginning of a TTS response. Following - AudioRawFrames are part of the TTS response until an TTSStoppedFrame. These - frames can be used for aggregating audio frames in a transport to optimize - the size of frames sent to the session, without needing to control this in - the TTS service. + TTSAudioRawFrames are part of the TTS response until an + TTSStoppedFrame. These frames can be used for aggregating audio frames in a + transport to optimize the size of frames sent to the session, without + needing to control this in the TTS service. """ pass diff --git a/src/pipecat/frames/protobufs/frames_pb2.py b/src/pipecat/frames/protobufs/frames_pb2.py index 5040efc97..d58bc8baa 100644 --- a/src/pipecat/frames/protobufs/frames_pb2.py +++ b/src/pipecat/frames/protobufs/frames_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\x07pipecat\"3\n\tTextFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\"c\n\rAudioRawFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\r\n\x05\x61udio\x18\x03 \x01(\x0c\x12\x13\n\x0bsample_rate\x18\x04 \x01(\r\x12\x14\n\x0cnum_channels\x18\x05 \x01(\r\"`\n\x12TranscriptionFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0f\n\x07user_id\x18\x04 \x01(\t\x12\x11\n\ttimestamp\x18\x05 \x01(\t\"\x93\x01\n\x05\x46rame\x12\"\n\x04text\x18\x01 \x01(\x0b\x32\x12.pipecat.TextFrameH\x00\x12\'\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x16.pipecat.AudioRawFrameH\x00\x12\x34\n\rtranscription\x18\x03 \x01(\x0b\x32\x1b.pipecat.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\x07pipecat\"3\n\tTextFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\"}\n\rAudioRawFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\r\n\x05\x61udio\x18\x03 \x01(\x0c\x12\x13\n\x0bsample_rate\x18\x04 \x01(\r\x12\x14\n\x0cnum_channels\x18\x05 \x01(\r\x12\x10\n\x03pts\x18\x06 \x01(\x04H\x00\x88\x01\x01\x42\x06\n\x04_pts\"`\n\x12TranscriptionFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0f\n\x07user_id\x18\x04 \x01(\t\x12\x11\n\ttimestamp\x18\x05 \x01(\t\"\x93\x01\n\x05\x46rame\x12\"\n\x04text\x18\x01 \x01(\x0b\x32\x12.pipecat.TextFrameH\x00\x12\'\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x16.pipecat.AudioRawFrameH\x00\x12\x34\n\rtranscription\x18\x03 \x01(\x0b\x32\x1b.pipecat.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -24,9 +24,9 @@ 
_globals['_TEXTFRAME']._serialized_start=25 _globals['_TEXTFRAME']._serialized_end=76 _globals['_AUDIORAWFRAME']._serialized_start=78 - _globals['_AUDIORAWFRAME']._serialized_end=177 - _globals['_TRANSCRIPTIONFRAME']._serialized_start=179 - _globals['_TRANSCRIPTIONFRAME']._serialized_end=275 - _globals['_FRAME']._serialized_start=278 - _globals['_FRAME']._serialized_end=425 + _globals['_AUDIORAWFRAME']._serialized_end=203 + _globals['_TRANSCRIPTIONFRAME']._serialized_start=205 + _globals['_TRANSCRIPTIONFRAME']._serialized_end=301 + _globals['_FRAME']._serialized_start=304 + _globals['_FRAME']._serialized_end=451 # @@protoc_insertion_point(module_scope) diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 379394120..13920c59b 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from typing import List +from typing import List, Type from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame, OpenAILLMContext @@ -34,8 +34,8 @@ def __init__( role: str, start_frame, end_frame, - accumulator_frame: TextFrame, - interim_accumulator_frame: TextFrame | None = None, + accumulator_frame: Type[TextFrame], + interim_accumulator_frame: Type[TextFrame] | None = None, handle_interruptions: bool = False ): super().__init__() diff --git a/src/pipecat/processors/aggregators/openai_llm_context.py b/src/pipecat/processors/aggregators/openai_llm_context.py index 0d8b19a36..3d1acf32e 100644 --- a/src/pipecat/processors/aggregators/openai_llm_context.py +++ b/src/pipecat/processors/aggregators/openai_llm_context.py @@ -13,7 +13,11 @@ from PIL import Image -from pipecat.frames.frames import Frame, VisionImageRawFrame, FunctionCallInProgressFrame, FunctionCallResultFrame +from pipecat.frames.frames import ( + Frame, + VisionImageRawFrame, + FunctionCallInProgressFrame, + FunctionCallResultFrame) from pipecat.processors.frame_processor import FrameProcessor from loguru import logger diff --git a/src/pipecat/processors/aggregators/vision_image_frame.py b/src/pipecat/processors/aggregators/vision_image_frame.py index 0bbb10841..97f6b5ec8 100644 --- a/src/pipecat/processors/aggregators/vision_image_frame.py +++ b/src/pipecat/processors/aggregators/vision_image_frame.py @@ -4,13 +4,19 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame, VisionImageRawFrame +from pipecat.frames.frames import ( + Frame, + InputImageRawFrame, + TextFrame, + VisionImageRawFrame +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor class VisionImageFrameAggregator(FrameProcessor): """This aggregator waits for a consecutive TextFrame and an - ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame. + InputImageRawFrame. After the InputImageRawFrame arrives it will output a + VisionImageRawFrame. 
>>> from pipecat.frames.frames import ImageFrame @@ -34,7 +40,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): if isinstance(frame, TextFrame): self._describe_text = frame.text - elif isinstance(frame, ImageRawFrame): + elif isinstance(frame, InputImageRawFrame): if self._describe_text: frame = VisionImageRawFrame( text=self._describe_text, diff --git a/src/pipecat/processors/gstreamer/pipeline_source.py b/src/pipecat/processors/gstreamer/pipeline_source.py index 8d46105a7..f852dd641 100644 --- a/src/pipecat/processors/gstreamer/pipeline_source.py +++ b/src/pipecat/processors/gstreamer/pipeline_source.py @@ -9,11 +9,11 @@ from pydantic import BaseModel from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, Frame, - ImageRawFrame, + OutputAudioRawFrame, + OutputImageRawFrame, StartFrame, SystemFrame) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -182,9 +182,9 @@ def _decodebin_video(self, pad: Gst.Pad): def _appsink_audio_new_sample(self, appsink: GstApp.AppSink): buffer = appsink.pull_sample().get_buffer() (_, info) = buffer.map(Gst.MapFlags.READ) - frame = AudioRawFrame(audio=info.data, - sample_rate=self._out_params.audio_sample_rate, - num_channels=self._out_params.audio_channels) + frame = OutputAudioRawFrame(audio=info.data, + sample_rate=self._out_params.audio_sample_rate, + num_channels=self._out_params.audio_channels) asyncio.run_coroutine_threadsafe(self.push_frame(frame), self.get_event_loop()) buffer.unmap(info) return Gst.FlowReturn.OK @@ -192,7 +192,7 @@ def _appsink_audio_new_sample(self, appsink: GstApp.AppSink): def _appsink_video_new_sample(self, appsink: GstApp.AppSink): buffer = appsink.pull_sample().get_buffer() (_, info) = buffer.map(Gst.MapFlags.READ) - frame = ImageRawFrame( + frame = OutputImageRawFrame( image=info.data, size=(self._out_params.video_width, self._out_params.video_height), format="RGB") diff --git a/src/pipecat/serializers/livekit.py b/src/pipecat/serializers/livekit.py index 7a0e8afd1..fec5243f5 100644 --- a/src/pipecat/serializers/livekit.py +++ b/src/pipecat/serializers/livekit.py @@ -7,7 +7,10 @@ import ctypes import pickle -from pipecat.frames.frames import AudioRawFrame, Frame +from pipecat.frames.frames import ( + Frame, + InputAudioRawFrame, + OutputAudioRawFrame) from pipecat.serializers.base_serializer import FrameSerializer from loguru import logger @@ -22,12 +25,8 @@ class LivekitFrameSerializer(FrameSerializer): - SERIALIZABLE_TYPES = { - AudioRawFrame: "audio", - } - def serialize(self, frame: Frame) -> str | bytes | None: - if not isinstance(frame, AudioRawFrame): + if not isinstance(frame, OutputAudioRawFrame): return None audio_frame = AudioFrame( data=frame.audio, @@ -39,7 +38,7 @@ def serialize(self, frame: Frame) -> str | bytes | None: def deserialize(self, data: str | bytes) -> Frame | None: audio_frame: AudioFrame = pickle.loads(data)['frame'] - return AudioRawFrame( + return InputAudioRawFrame( audio=bytes(audio_frame.data), sample_rate=audio_frame.sample_rate, num_channels=audio_frame.num_channels, diff --git a/src/pipecat/serializers/protobuf.py b/src/pipecat/serializers/protobuf.py index 0a6dee0b1..6ae1b0c03 100644 --- a/src/pipecat/serializers/protobuf.py +++ b/src/pipecat/serializers/protobuf.py @@ -8,7 +8,11 @@ import pipecat.frames.protobufs.frames_pb2 as frame_protos -from pipecat.frames.frames import AudioRawFrame, Frame, TextFrame, TranscriptionFrame +from pipecat.frames.frames import ( + AudioRawFrame, + Frame, + TextFrame, + 
TranscriptionFrame) from pipecat.serializers.base_serializer import FrameSerializer from loguru import logger @@ -29,14 +33,15 @@ def __init__(self): def serialize(self, frame: Frame) -> str | bytes | None: proto_frame = frame_protos.Frame() if type(frame) not in self.SERIALIZABLE_TYPES: - raise ValueError( - f"Frame type {type(frame)} is not serializable. You may need to add it to ProtobufFrameSerializer.SERIALIZABLE_FIELDS.") + logger.warning(f"Frame type {type(frame)} is not serializable") + return None # ignoring linter errors; we check that type(frame) is in this dict above proto_optional_name = self.SERIALIZABLE_TYPES[type(frame)] # type: ignore for field in dataclasses.fields(frame): # type: ignore - setattr(getattr(proto_frame, proto_optional_name), field.name, - getattr(frame, field.name)) + value = getattr(frame, field.name) + if value: + setattr(getattr(proto_frame, proto_optional_name), field.name, value) result = proto_frame.SerializeToString() return result @@ -48,8 +53,8 @@ def deserialize(self, data: str | bytes) -> Frame | None: >>> serializer = ProtobufFrameSerializer() >>> serializer.deserialize( - ... serializer.serialize(AudioFrame(data=b'1234567890'))) - AudioFrame(data=b'1234567890') + ... serializer.serialize(OutputAudioFrame(data=b'1234567890'))) + InputAudioFrame(data=b'1234567890') >>> serializer.deserialize( ... serializer.serialize(TextFrame(text='hello world'))) @@ -75,10 +80,13 @@ def deserialize(self, data: str | bytes) -> Frame | None: # Remove special fields if needed id = getattr(args, "id") name = getattr(args, "name") + pts = getattr(args, "pts") if not id: del args_dict["id"] if not name: del args_dict["name"] + if not pts: + del args_dict["pts"] # Create the instance instance = class_name(**args_dict) @@ -88,5 +96,7 @@ def deserialize(self, data: str | bytes) -> Frame | None: setattr(instance, "id", getattr(args, "id")) if name: setattr(instance, "name", getattr(args, "name")) + if pts: + setattr(instance, "pts", getattr(args, "pts")) return instance diff --git a/src/pipecat/serializers/twilio.py b/src/pipecat/serializers/twilio.py index 583234ae4..ed2905a40 100644 --- a/src/pipecat/serializers/twilio.py +++ b/src/pipecat/serializers/twilio.py @@ -9,7 +9,10 @@ from pydantic import BaseModel -from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame +from pipecat.frames.frames import ( + AudioRawFrame, + Frame, + StartInterruptionFrame) from pipecat.serializers.base_serializer import FrameSerializer from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw @@ -19,10 +22,6 @@ class InputParams(BaseModel): twilio_sample_rate: int = 8000 sample_rate: int = 16000 - SERIALIZABLE_TYPES = { - AudioRawFrame: "audio", - } - def __init__(self, stream_sid: str, params: InputParams = InputParams()): self._stream_sid = stream_sid self._params = params diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index d91782e42..c00064e5d 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -22,6 +22,7 @@ STTModelUpdateFrame, StartFrame, StartInterruptionFrame, + TTSAudioRawFrame, TTSLanguageUpdateFrame, TTSModelUpdateFrame, TTSSpeakFrame, @@ -277,7 +278,7 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect if self._push_stop_frames and ( isinstance(frame, StartInterruptionFrame) or isinstance(frame, TTSStartedFrame) or - isinstance(frame, AudioRawFrame) or + isinstance(frame, TTSAudioRawFrame) or isinstance(frame, TTSStoppedFrame)): await 
self._stop_frame_queue.put(frame) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 90674fcc4..2c01fff3c 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -12,12 +12,12 @@ from typing import AsyncGenerator from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame, + TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, TranscriptionFrame, @@ -115,7 +115,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.stop_ttfb_metrics() await self.push_frame(TTSStartedFrame()) # Azure always sends a 44-byte header. Strip it off. - yield AudioRawFrame(audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1) + yield TTSAudioRawFrame(audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1) await self.push_frame(TTSStoppedFrame()) elif result.reason == ResultReason.Canceled: cancellation_details = result.cancellation_details diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 078926235..ac0a1b85b 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -15,10 +15,10 @@ CancelFrame, ErrorFrame, Frame, - AudioRawFrame, StartInterruptionFrame, StartFrame, EndFrame, + TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, LLMFullResponseEndFrame @@ -201,7 +201,7 @@ async def _receive_task_handler(self): elif msg["type"] == "chunk": await self.stop_ttfb_metrics() self.start_word_timestamps() - frame = AudioRawFrame( + frame = TTSAudioRawFrame( audio=base64.b64decode(msg["data"]), sample_rate=self._output_format["sample_rate"], num_channels=1 @@ -326,7 +326,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.stop_ttfb_metrics() - frame = AudioRawFrame( + frame = TTSAudioRawFrame( audio=output["audio"], sample_rate=self._output_format["sample_rate"], num_channels=1 diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 708c3c511..5ce17d6ee 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -9,13 +9,13 @@ from typing import AsyncGenerator from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, Frame, InterimTranscriptionFrame, StartFrame, + TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, TranscriptionFrame) @@ -101,7 +101,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.push_frame(TTSStartedFrame()) async for data in r.content: await self.stop_ttfb_metrics() - frame = AudioRawFrame(audio=data, sample_rate=self._sample_rate, num_channels=1) + frame = TTSAudioRawFrame( + audio=data, sample_rate=self._sample_rate, num_channels=1) yield frame await self.push_frame(TTSStoppedFrame()) except Exception as e: diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 081a6bf5d..f78eba266 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -12,12 +12,12 @@ from pydantic import BaseModel from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, Frame, StartFrame, StartInterruptionFrame, + TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame) from pipecat.processors.frame_processor import FrameDirection @@ -209,7 +209,7 @@ async def _receive_task_handler(self): self.start_word_timestamps() audio = base64.b64decode(msg["audio"]) - frame = AudioRawFrame(audio, self.sample_rate, 1) + frame = TTSAudioRawFrame(audio, self.sample_rate, 1) 
await self.push_frame(frame) if msg.get("alignment"): diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py index 60f0cb7df..9285f1583 100644 --- a/src/pipecat/services/lmnt.py +++ b/src/pipecat/services/lmnt.py @@ -10,13 +10,13 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame, StartInterruptionFrame, + TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, ) @@ -126,7 +126,7 @@ async def _receive_task_handler(self): await self.push_error(ErrorFrame(f'{self} error: {msg["error"]}')) elif "audio" in msg: await self.stop_ttfb_metrics() - frame = AudioRawFrame( + frame = TTSAudioRawFrame( audio=msg["audio"], sample_rate=self._output_format["sample_rate"], num_channels=1 diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 7483e2eb5..8a1355916 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -17,13 +17,13 @@ from PIL import Image from pipecat.frames.frames import ( - AudioRawFrame, ErrorFrame, Frame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, LLMModelUpdateFrame, + TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, TextFrame, @@ -365,7 +365,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: async for chunk in r.iter_bytes(8192): if len(chunk) > 0: await self.stop_ttfb_metrics() - frame = AudioRawFrame(chunk, self.sample_rate, 1) + frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) yield frame await self.push_frame(TTSStoppedFrame()) except BadRequestError as e: diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py index c3200fee9..ae8606e91 100644 --- a/src/pipecat/services/playht.py +++ b/src/pipecat/services/playht.py @@ -9,7 +9,11 @@ from typing import AsyncGenerator -from pipecat.frames.frames import AudioRawFrame, Frame, TTSStartedFrame, TTSStoppedFrame +from pipecat.frames.frames import ( + Frame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame) from pipecat.services.ai_services import TTSService from loguru import logger @@ -91,7 +95,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: else: if len(chunk): await self.stop_ttfb_metrics() - frame = AudioRawFrame(chunk, 16000, 1) + frame = TTSAudioRawFrame(chunk, 16000, 1) yield frame await self.push_frame(TTSStoppedFrame()) except Exception as e: diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 49759cb01..dfd4c2966 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -4,23 +4,19 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import base64 import json -import io -import copy -from typing import List, Optional -from dataclasses import dataclass -from asyncio import CancelledError import re import uuid +from typing import List +from dataclasses import dataclass +from asyncio import CancelledError + from pipecat.frames.frames import ( Frame, LLMModelUpdateFrame, TextFrame, - VisionImageRawFrame, UserImageRequestFrame, - UserImageRawFrame, LLMMessagesFrame, LLMFullResponseStartFrame, LLMFullResponseEndFrame, diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 38f0f9a64..69b754f55 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -9,10 +9,10 @@ from typing import Any, AsyncGenerator, Dict from pipecat.frames.frames import ( - AudioRawFrame, ErrorFrame, Frame, StartFrame, + TTSAudioRawFrame, TTSStartedFrame, 
TTSStoppedFrame) from pipecat.services.ai_services import TTSService @@ -128,7 +128,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: # Convert the numpy array back to bytes resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes() # Create the frame with the resampled audio - frame = AudioRawFrame(resampled_audio_bytes, 16000, 1) + frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1) yield frame # Process any remaining data in the buffer @@ -136,7 +136,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: audio_np = np.frombuffer(buffer, dtype=np.int16) resampled_audio = resampy.resample(audio_np, 24000, 16000) resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes() - frame = AudioRawFrame(resampled_audio_bytes, 16000, 1) + frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1) yield frame await self.push_frame(TTSStoppedFrame()) diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index de1ec8884..4e398e779 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -10,9 +10,9 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.frames.frames import ( - AudioRawFrame, BotInterruptionFrame, CancelFrame, + InputAudioRawFrame, StartFrame, EndFrame, Frame, @@ -59,7 +59,7 @@ async def cancel(self, frame: CancelFrame): def vad_analyzer(self) -> VADAnalyzer | None: return self._params.vad_analyzer - async def push_audio_frame(self, frame: AudioRawFrame): + async def push_audio_frame(self, frame: InputAudioRawFrame): if self._params.audio_in_enabled or self._params.vad_enabled: await self._audio_in_queue.put(frame) @@ -151,7 +151,7 @@ async def _audio_task_handler(self): vad_state: VADState = VADState.QUIET while True: try: - frame: AudioRawFrame = await self._audio_in_queue.get() + frame: InputAudioRawFrame = await self._audio_in_queue.get() audio_passthrough = True diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 9b1b9c29e..263bb64f4 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -15,17 +15,17 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.frames.frames import ( - AudioRawFrame, BotSpeakingFrame, BotStartedSpeakingFrame, BotStoppedSpeakingFrame, CancelFrame, MetricsFrame, + OutputAudioRawFrame, + OutputImageRawFrame, SpriteFrame, StartFrame, EndFrame, Frame, - ImageRawFrame, StartInterruptionFrame, StopInterruptionFrame, SystemFrame, @@ -122,7 +122,7 @@ async def send_message(self, frame: TransportMessageFrame): async def send_metrics(self, frame: MetricsFrame): pass - async def write_frame_to_camera(self, frame: ImageRawFrame): + async def write_frame_to_camera(self, frame: OutputImageRawFrame): pass async def write_raw_audio_frames(self, frames: bytes): @@ -162,9 +162,9 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self._sink_queue.put(frame) await self.stop(frame) # Other frames. 
- elif isinstance(frame, AudioRawFrame): + elif isinstance(frame, OutputAudioRawFrame): await self._handle_audio(frame) - elif isinstance(frame, ImageRawFrame) or isinstance(frame, SpriteFrame): + elif isinstance(frame, OutputImageRawFrame) or isinstance(frame, SpriteFrame): await self._handle_image(frame) elif isinstance(frame, TransportMessageFrame) and frame.urgent: await self.send_message(frame) @@ -191,7 +191,7 @@ async def _handle_interruptions(self, frame: Frame): if self._bot_speaking: await self._bot_stopped_speaking() - async def _handle_audio(self, frame: AudioRawFrame): + async def _handle_audio(self, frame: OutputAudioRawFrame): if not self._params.audio_out_enabled: return @@ -200,12 +200,14 @@ async def _handle_audio(self, frame: AudioRawFrame): else: self._audio_buffer.extend(frame.audio) while len(self._audio_buffer) >= self._audio_chunk_size: - chunk = AudioRawFrame(bytes(self._audio_buffer[:self._audio_chunk_size]), - sample_rate=frame.sample_rate, num_channels=frame.num_channels) + chunk = OutputAudioRawFrame( + bytes(self._audio_buffer[:self._audio_chunk_size]), + sample_rate=frame.sample_rate, num_channels=frame.num_channels + ) await self._sink_queue.put(chunk) self._audio_buffer = self._audio_buffer[self._audio_chunk_size:] - async def _handle_image(self, frame: ImageRawFrame | SpriteFrame): + async def _handle_image(self, frame: OutputImageRawFrame | SpriteFrame): if not self._params.camera_out_enabled: return @@ -226,11 +228,11 @@ def _create_sink_tasks(self): self._sink_clock_task = loop.create_task(self._sink_clock_task_handler()) async def _sink_frame_handler(self, frame: Frame): - if isinstance(frame, AudioRawFrame): + if isinstance(frame, OutputAudioRawFrame): await self.write_raw_audio_frames(frame.audio) await self.push_frame(frame) await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM) - elif isinstance(frame, ImageRawFrame): + elif isinstance(frame, OutputImageRawFrame): await self._set_camera_image(frame) elif isinstance(frame, SpriteFrame): await self._set_camera_images(frame.images) @@ -305,10 +307,10 @@ async def _bot_stopped_speaking(self): # Camera out # - async def send_image(self, frame: ImageRawFrame | SpriteFrame): + async def send_image(self, frame: OutputImageRawFrame | SpriteFrame): await self.process_frame(frame, FrameDirection.DOWNSTREAM) - async def _draw_image(self, frame: ImageRawFrame): + async def _draw_image(self, frame: OutputImageRawFrame): desired_size = (self._params.camera_out_width, self._params.camera_out_height) if frame.size != desired_size: @@ -316,14 +318,17 @@ async def _draw_image(self, frame: ImageRawFrame): resized_image = image.resize(desired_size) logger.warning( f"{frame} does not have the expected size {desired_size}, resizing") - frame = ImageRawFrame(resized_image.tobytes(), resized_image.size, resized_image.format) + frame = OutputImageRawFrame( + resized_image.tobytes(), + resized_image.size, + resized_image.format) await self.write_frame_to_camera(frame) - async def _set_camera_image(self, image: ImageRawFrame): + async def _set_camera_image(self, image: OutputImageRawFrame): self._camera_images = itertools.cycle([image]) - async def _set_camera_images(self, images: List[ImageRawFrame]): + async def _set_camera_images(self, images: List[OutputImageRawFrame]): self._camera_images = itertools.cycle(images) async def _camera_out_task_handler(self): @@ -375,7 +380,7 @@ async def _camera_out_is_live_handler(self): # Audio out # - async def send_audio(self, frame: AudioRawFrame): + async def 
send_audio(self, frame: OutputAudioRawFrame): await self.process_frame(frame, FrameDirection.DOWNSTREAM) async def _audio_out_task_handler(self): diff --git a/src/pipecat/transports/local/audio.py b/src/pipecat/transports/local/audio.py index cd05550a9..45d18db52 100644 --- a/src/pipecat/transports/local/audio.py +++ b/src/pipecat/transports/local/audio.py @@ -8,7 +8,7 @@ from concurrent.futures import ThreadPoolExecutor -from pipecat.frames.frames import AudioRawFrame, StartFrame +from pipecat.frames.frames import InputAudioRawFrame, StartFrame from pipecat.processors.frame_processor import FrameProcessor from pipecat.transports.base_input import BaseInputTransport from pipecat.transports.base_output import BaseOutputTransport @@ -54,9 +54,9 @@ async def cleanup(self): self._in_stream.close() def _audio_in_callback(self, in_data, frame_count, time_info, status): - frame = AudioRawFrame(audio=in_data, - sample_rate=self._params.audio_in_sample_rate, - num_channels=self._params.audio_in_channels) + frame = InputAudioRawFrame(audio=in_data, + sample_rate=self._params.audio_in_sample_rate, + num_channels=self._params.audio_in_channels) asyncio.run_coroutine_threadsafe(self.push_audio_frame(frame), self.get_event_loop()) diff --git a/src/pipecat/transports/local/tk.py b/src/pipecat/transports/local/tk.py index e7dc04902..75dd30331 100644 --- a/src/pipecat/transports/local/tk.py +++ b/src/pipecat/transports/local/tk.py @@ -11,8 +11,7 @@ import numpy as np import tkinter as tk -from pipecat.frames.frames import AudioRawFrame, ImageRawFrame, StartFrame -from pipecat.processors.frame_processor import FrameProcessor +from pipecat.frames.frames import InputAudioRawFrame, OutputImageRawFrame, StartFrame from pipecat.transports.base_input import BaseInputTransport from pipecat.transports.base_output import BaseOutputTransport from pipecat.transports.base_transport import BaseTransport, TransportParams @@ -64,9 +63,9 @@ async def cleanup(self): self._in_stream.close() def _audio_in_callback(self, in_data, frame_count, time_info, status): - frame = AudioRawFrame(audio=in_data, - sample_rate=self._params.audio_in_sample_rate, - num_channels=self._params.audio_in_channels) + frame = InputAudioRawFrame(audio=in_data, + sample_rate=self._params.audio_in_sample_rate, + num_channels=self._params.audio_in_channels) asyncio.run_coroutine_threadsafe(self.push_audio_frame(frame), self.get_event_loop()) @@ -108,10 +107,10 @@ async def cleanup(self): async def write_raw_audio_frames(self, frames: bytes): await self.get_event_loop().run_in_executor(self._executor, self._out_stream.write, frames) - async def write_frame_to_camera(self, frame: ImageRawFrame): + async def write_frame_to_camera(self, frame: OutputImageRawFrame): self.get_event_loop().call_soon(self._write_frame_to_tk, frame) - def _write_frame_to_tk(self, frame: ImageRawFrame): + def _write_frame_to_tk(self, frame: OutputImageRawFrame): width = frame.size[0] height = frame.size[1] data = f"P6 {width} {height} 255 ".encode() + frame.image diff --git a/src/pipecat/transports/network/fastapi_websocket.py b/src/pipecat/transports/network/fastapi_websocket.py index 7169c73bd..815d7c2ef 100644 --- a/src/pipecat/transports/network/fastapi_websocket.py +++ b/src/pipecat/transports/network/fastapi_websocket.py @@ -12,8 +12,16 @@ from typing import Awaitable, Callable from pydantic.main import BaseModel -from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, Frame, StartFrame, StartInterruptionFrame -from pipecat.processors.frame_processor import 
FrameDirection, FrameProcessor +from pipecat.frames.frames import ( + AudioRawFrame, + CancelFrame, + EndFrame, + Frame, + InputAudioRawFrame, + StartFrame, + StartInterruptionFrame +) +from pipecat.processors.frame_processor import FrameDirection from pipecat.serializers.base_serializer import FrameSerializer from pipecat.transports.base_input import BaseInputTransport from pipecat.transports.base_output import BaseOutputTransport @@ -79,7 +87,11 @@ async def _receive_messages(self): continue if isinstance(frame, AudioRawFrame): - await self.push_audio_frame(frame) + await self.push_audio_frame(InputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels) + ) await self._callbacks.on_client_disconnected(self._websocket) diff --git a/src/pipecat/transports/network/websocket_server.py b/src/pipecat/transports/network/websocket_server.py index c17818898..329ae8994 100644 --- a/src/pipecat/transports/network/websocket_server.py +++ b/src/pipecat/transports/network/websocket_server.py @@ -11,8 +11,7 @@ from typing import Awaitable, Callable from pydantic.main import BaseModel -from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, StartFrame -from pipecat.processors.frame_processor import FrameProcessor +from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, InputAudioRawFrame, StartFrame from pipecat.serializers.base_serializer import FrameSerializer from pipecat.serializers.protobuf import ProtobufFrameSerializer from pipecat.transports.base_input import BaseInputTransport @@ -98,7 +97,11 @@ async def _client_handler(self, websocket: websockets.WebSocketServerProtocol, p continue if isinstance(frame, AudioRawFrame): - await self.queue_audio_frame(frame) + await self.push_audio_frame(InputAudioRawFrame( + audio=frame.audio, + sample_rate=frame.sample_rate, + num_channels=frame.num_channels) + ) else: await self.push_frame(frame) diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 2a45adf36..d595aaceb 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -22,13 +22,14 @@ from pydantic.main import BaseModel from pipecat.frames.frames import ( - AudioRawFrame, CancelFrame, EndFrame, Frame, - ImageRawFrame, + InputAudioRawFrame, InterimTranscriptionFrame, MetricsFrame, + OutputAudioRawFrame, + OutputImageRawFrame, SpriteFrame, StartFrame, TranscriptionFrame, @@ -239,7 +240,7 @@ async def send_message(self, frame: TransportMessageFrame): completion=completion_callback(future)) await future - async def read_next_audio_frame(self) -> AudioRawFrame | None: + async def read_next_audio_frame(self) -> InputAudioRawFrame | None: if not self._speaker: return None @@ -252,7 +253,10 @@ async def read_next_audio_frame(self) -> AudioRawFrame | None: audio = await future if len(audio) > 0: - return AudioRawFrame(audio=audio, sample_rate=sample_rate, num_channels=num_channels) + return InputAudioRawFrame( + audio=audio, + sample_rate=sample_rate, + num_channels=num_channels) else: # If we don't read any audio it could be there's no participant # connected. 
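
The _handle_audio hunk in base_output.py above splits buffered PCM into fixed-size OutputAudioRawFrame chunks before queuing them, keeping any partial tail for the next incoming frame. A hedged sketch of that pattern, assuming the same (audio, sample_rate, num_channels) constructor used throughout this patch:

```python
# Hedged sketch of the fixed-size chunking in base_output.py's _handle_audio:
# full chunks are emitted, the unconsumed tail stays in the buffer.
from typing import List

from pipecat.frames.frames import OutputAudioRawFrame


def drain_audio_buffer(buffer: bytearray, chunk_size: int,
                       sample_rate: int, num_channels: int) -> List[OutputAudioRawFrame]:
    chunks: List[OutputAudioRawFrame] = []
    while len(buffer) >= chunk_size:
        chunks.append(OutputAudioRawFrame(
            bytes(buffer[:chunk_size]),
            sample_rate=sample_rate,
            num_channels=num_channels))
        del buffer[:chunk_size]  # keep only the leftover partial chunk
    return chunks
```
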
daily-python will return immediately if that's the @@ -268,7 +272,7 @@ async def write_raw_audio_frames(self, frames: bytes): self._mic.write_frames(frames, completion=completion_callback(future)) await future - async def write_frame_to_camera(self, frame: ImageRawFrame): + async def write_frame_to_camera(self, frame: OutputImageRawFrame): if not self._camera: return None @@ -749,7 +753,7 @@ async def send_metrics(self, frame: MetricsFrame): async def write_raw_audio_frames(self, frames: bytes): await self._client.write_raw_audio_frames(frames) - async def write_frame_to_camera(self, frame: ImageRawFrame): + async def write_frame_to_camera(self, frame: OutputImageRawFrame): await self._client.write_frame_to_camera(frame) @@ -829,11 +833,11 @@ def output(self) -> DailyOutputTransport: def participant_id(self) -> str: return self._client.participant_id - async def send_image(self, frame: ImageRawFrame | SpriteFrame): + async def send_image(self, frame: OutputImageRawFrame | SpriteFrame): if self._output: await self._output.process_frame(frame, FrameDirection.DOWNSTREAM) - async def send_audio(self, frame: AudioRawFrame): + async def send_audio(self, frame: OutputAudioRawFrame): if self._output: await self._output.process_frame(frame, FrameDirection.DOWNSTREAM) From 3b81cd462d36c83b5d741aed253b56932a9ae06c Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 20 Sep 2024 13:41:04 -0400 Subject: [PATCH 05/10] Input params to OpenAI LLM --- src/pipecat/services/openai.py | 63 ++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 6fe710b5e..440281c90 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -11,7 +11,8 @@ import httpx from dataclasses import dataclass -from typing import AsyncGenerator, Dict, List, Literal +from typing import AsyncGenerator, Dict, List, Literal, Optional +from pydantic import BaseModel, Field from loguru import logger from PIL import Image @@ -48,7 +49,7 @@ ) try: - from openai import AsyncOpenAI, AsyncStream, DefaultAsyncHttpxClient, BadRequestError + from openai import AsyncOpenAI, AsyncStream, DefaultAsyncHttpxClient, BadRequestError, NOT_GIVEN from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -81,11 +82,31 @@ class BaseOpenAILLMService(LLMService): as well as tool choices and the tool, which is used if requesting function calls from the LLM. 
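
The hunks that follow add an InputParams model to BaseOpenAILLMService. A hedged usage sketch, with field names taken from those hunks; unset fields default to NOT_GIVEN and are effectively omitted from the chat completion request:

```python
# Hedged usage sketch for the OpenAI InputParams added below.
import os

from pipecat.services.openai import OpenAILLMService

llm = OpenAILLMService(
    api_key=os.getenv("OPENAI_API_KEY"),
    model="gpt-4o",
    params=OpenAILLMService.InputParams(
        temperature=0.7,
        top_p=0.9,
        frequency_penalty=0.3,
        seed=42,
    ),
)

# Parameters can also be changed at runtime via the matching setters,
# e.g. `await llm.set_temperature(1.0)` from inside a coroutine.
```
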
""" + class InputParams(BaseModel): + frequency_penalty: Optional[float] = Field( + default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0) + presence_penalty: Optional[float] = Field( + default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0) + seed: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=0) + temperature: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0) - def __init__(self, *, model: str, api_key=None, base_url=None, **kwargs): + def __init__( + self, + *, + model: str, + api_key=None, + base_url=None, + params: InputParams = InputParams(), + **kwargs): super().__init__(**kwargs) self.set_model_name(model) self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs) + self._frequency_penalty = params.frequency_penalty + self._presence_penalty = params.presence_penalty + self._seed = params.seed + self._temperature = params.temperature + self._top_p = params.top_p def create_client(self, api_key=None, base_url=None, **kwargs): return AsyncOpenAI( @@ -100,6 +121,26 @@ def create_client(self, api_key=None, base_url=None, **kwargs): def can_generate_metrics(self) -> bool: return True + async def set_frequency_penalty(self, frequency_penalty: float): + logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]") + self._frequency_penalty = frequency_penalty + + async def set_presence_penalty(self, presence_penalty: float): + logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]") + self._presence_penalty = presence_penalty + + async def set_seed(self, seed: int): + logger.debug(f"Switching LLM seed to: [{seed}]") + self._seed = seed + + async def set_temperature(self, temperature: float): + logger.debug(f"Switching LLM temperature to: [{temperature}]") + self._temperature = temperature + + async def set_top_p(self, top_p: float): + logger.debug(f"Switching LLM top_p to: [{top_p}]") + self._top_p = top_p + async def get_chat_completions( self, context: OpenAILLMContext, @@ -110,7 +151,12 @@ async def get_chat_completions( messages=messages, tools=context.tools, tool_choice=context.tool_choice, - stream_options={"include_usage": True} + stream_options={"include_usage": True}, + frequency_penalty=self._frequency_penalty, + presence_penalty=self._presence_penalty, + seed=self._seed, + temperature=self._temperature, + top_p=self._top_p ) return chunks @@ -248,8 +294,13 @@ def assistant(self) -> 'OpenAIAssistantContextAggregator': class OpenAILLMService(BaseOpenAILLMService): - def __init__(self, *, model: str = "gpt-4o", **kwargs): - super().__init__(model=model, **kwargs) + def __init__( + self, + *, + model: str = "gpt-4o", + params: BaseOpenAILLMService.InputParams = BaseOpenAILLMService.InputParams(), + **kwargs): + super().__init__(model=model, params=params, **kwargs) @staticmethod def create_context_aggregator(context: OpenAILLMContext) -> OpenAIContextAggregatorPair: From 4fa1ea8c4b7e2d5b156bc19c4f38e3f2ebc0c0ed Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 20 Sep 2024 15:29:20 -0400 Subject: [PATCH 06/10] Input params for Anthropic LLM --- src/pipecat/services/anthropic.py | 42 +++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 7935691ce..ea1756f8c 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -13,6 +13,7 @@ from PIL import Image from asyncio import 
CancelledError import re +from pydantic import BaseModel, Field from pipecat.frames.frames import ( Frame, @@ -74,20 +75,28 @@ def assistant(self) -> 'AnthropicAssistantContextAggregator': class AnthropicLLMService(LLMService): """This class implements inference with Anthropic's AI models """ + class InputParams(BaseModel): + enable_prompt_caching_beta: Optional[bool] = False + max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1) + temperature: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=0) + top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0) def __init__( self, *, api_key: str, model: str = "claude-3-5-sonnet-20240620", - max_tokens: int = 4096, - enable_prompt_caching_beta: bool = False, + params: InputParams = InputParams(), **kwargs): super().__init__(**kwargs) self._client = AsyncAnthropic(api_key=api_key) self.set_model_name(model) - self._max_tokens = max_tokens - self._enable_prompt_caching_beta = enable_prompt_caching_beta + self._max_tokens = params.max_tokens + self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False + self._temperature = params.temperature + self._top_k = params.top_k + self._top_p = params.top_p def can_generate_metrics(self) -> bool: return True @@ -105,6 +114,26 @@ def create_context_aggregator(context: OpenAILLMContext) -> AnthropicContextAggr _assistant=assistant ) + async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool): + logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]") + self._enable_prompt_caching_beta = enable_prompt_caching_beta + + async def set_max_tokens(self, max_tokens: int): + logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]") + self._max_tokens = max_tokens + + async def set_temperature(self, temperature: float): + logger.debug(f"Switching LLM temperature to: [{temperature}]") + self._temperature = temperature + + async def set_top_k(self, top_k: float): + logger.debug(f"Switching LLM top_k to: [{top_k}]") + self._top_k = top_k + + async def set_top_p(self, top_p: float): + logger.debug(f"Switching LLM top_p to: [{top_p}]") + self._top_p = top_p + async def _process_context(self, context: OpenAILLMContext): # Usage tracking. We track the usage reported by Anthropic in prompt_tokens and # completion_tokens. 
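
A hedged usage sketch for the Anthropic InputParams shown above: max_tokens keeps its previous default of 4096, while the sampling fields default to NOT_GIVEN and are left out of the request unless set.

```python
# Hedged usage sketch for AnthropicLLMService.InputParams as added in this patch.
import os

from pipecat.services.anthropic import AnthropicLLMService

llm = AnthropicLLMService(
    api_key=os.getenv("ANTHROPIC_API_KEY"),
    model="claude-3-5-sonnet-20240620",
    params=AnthropicLLMService.InputParams(
        temperature=0.8,            # valid range per the model above is 0.0-1.0
        top_p=0.95,
        enable_prompt_caching_beta=True,
    ),
)
```
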
We also estimate the completion tokens from output text @@ -140,7 +169,10 @@ async def _process_context(self, context: OpenAILLMContext): messages=messages, model=self.model_name, max_tokens=self._max_tokens, - stream=True) + stream=True, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p) await self.stop_ttfb_metrics() From 357e66d64da74a0fc66ce45f2f9fcc0f6fb3fc0d Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 20 Sep 2024 16:18:25 -0400 Subject: [PATCH 07/10] Input params for Together AI LLM --- src/pipecat/services/together.py | 46 ++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 004236ac8..6bc3980ce 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -13,6 +13,7 @@ from asyncio import CancelledError import re import uuid +from pydantic import BaseModel, Field from pipecat.frames.frames import ( Frame, @@ -58,18 +59,30 @@ def assistant(self) -> 'TogetherAssistantContextAggregator': class TogetherLLMService(LLMService): """This class implements inference with Together's Llama 3.1 models """ + class InputParams(BaseModel): + frequency_penalty: Optional[float] = Field(default=None, ge=-2.0, le=2.0) + max_tokens: Optional[int] = Field(default=4096, ge=1) + presence_penalty: Optional[float] = Field(default=None, ge=-2.0, le=2.0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, ge=0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) def __init__( self, *, api_key: str, model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - max_tokens: int = 4096, + params: InputParams = InputParams(), **kwargs): super().__init__(**kwargs) self._client = AsyncTogether(api_key=api_key) self.set_model_name(model) - self._max_tokens = max_tokens + self._max_tokens = params.max_tokens + self._frequency_penalty = params.frequency_penalty + self._presence_penalty = params.presence_penalty + self._temperature = params.temperature + self._top_k = params.top_k + self._top_p = params.top_p def can_generate_metrics(self) -> bool: return True @@ -83,6 +96,30 @@ def create_context_aggregator(context: OpenAILLMContext) -> TogetherContextAggre _assistant=assistant ) + async def set_frequency_penalty(self, frequency_penalty: float): + logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]") + self._frequency_penalty = frequency_penalty + + async def set_max_tokens(self, max_tokens: int): + logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]") + self._max_tokens = max_tokens + + async def set_presence_penalty(self, presence_penalty: float): + logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]") + self._presence_penalty = presence_penalty + + async def set_temperature(self, temperature: float): + logger.debug(f"Switching LLM temperature to: [{temperature}]") + self._temperature = temperature + + async def set_top_k(self, top_k: float): + logger.debug(f"Switching LLM top_k to: [{top_k}]") + self._top_k = top_k + + async def set_top_p(self, top_p: float): + logger.debug(f"Switching LLM top_p to: [{top_p}]") + self._top_p = top_p + async def _process_context(self, context: OpenAILLMContext): try: await self.push_frame(LLMFullResponseStartFrame()) @@ -97,6 +134,11 @@ async def _process_context(self, context: OpenAILLMContext): model=self.model_name, max_tokens=self._max_tokens, stream=True, + 
frequency_penalty=self._frequency_penalty, + presence_penalty=self._presence_penalty, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p ) # Function calling From f3fd312b8353bd9211fbe7c48bed0b0e8ad70c36 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 20 Sep 2024 17:42:58 -0400 Subject: [PATCH 08/10] Add Together AI interruptible example --- .../07l-interruptible-together.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 examples/foundational/07l-interruptible-together.py diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py new file mode 100644 index 000000000..41befb67f --- /dev/null +++ b/examples/foundational/07l-interruptible-together.py @@ -0,0 +1,100 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import aiohttp +import os +import sys + +from pipecat.frames.frames import LLMMessagesFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, LLMUserResponseAggregator) +from pipecat.services.cartesia import CartesiaTTSService +from pipecat.services.together import TogetherLLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.vad.silero import SileroVADAnalyzer + +from runner import configure + +from loguru import logger + +from dotenv import load_dotenv +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + transcription_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer() + ) + ) + + tts = CartesiaTTSService( + api_key=os.getenv("CARTESIA_API_KEY"), + voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady + ) + + llm = TogetherLLMService( + api_key=os.getenv("TOGETHER_API_KEY"), + model=os.getenv("TOGETHER_MODEL"), + params=TogetherLLMService.InputParams( + temperature=1.0, + frequency_penalty=2.0, + presence_penalty=0.0, + top_p=0.9, + top_k=40 + ) + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline([ + transport.input(), # Transport user input + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out # Assistant spoken responses + ]) + + task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True)) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. 
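
The InputParams passed to TogetherLLMService in this example are range-checked by pydantic's Field(ge=..., le=...) bounds, so out-of-range values fail at construction time rather than being passed through to the provider API. A hedged illustration:

```python
# Hedged illustration of InputParams range validation.
from pydantic import ValidationError

from pipecat.services.together import TogetherLLMService

try:
    TogetherLLMService.InputParams(temperature=3.0)  # allowed range is 0.0-1.0
except ValidationError as exc:
    print(exc)  # reports that temperature must be less than or equal to 1.0
```
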
+ await task.queue_frames([LLMMessagesFrame(messages)]) + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) From 219304c5eeeeb36e773222bd46631119301027a8 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 20 Sep 2024 20:31:42 -0400 Subject: [PATCH 09/10] Added Changelog entries --- CHANGELOG.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 985834aba..ba20821cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added configurable LLM parameters (e.g., temperature, top_p, max_tokens, seed) + for OpenAI, Anthropic, and Together AI services along with corresponding + setter functions. + +- Added `sample_rate` as a constructor parameter for TTS services. + - Pipecat has a pipeline-based architecture. The pipeline consists of frame processors linked to each other. The elements traveling across the pipeline are called frames. @@ -334,7 +340,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - It is now possible to specify a Silero VAD version when using `SileroVADAnalyzer` or `SileroVAD`. -- Added `AysncFrameProcessor` and `AsyncAIService`. Some services like +- Added `AysncFrameProcessor` and `AsyncAIService`. Some services like `DeepgramSTTService` need to process things asynchronously. For example, audio is sent to Deepgram but transcriptions are not returned immediately. In these cases we still require all frames (except system frames) to be pushed @@ -351,7 +357,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `WhisperSTTService` model can now also be a string. -- Added missing * keyword separators in services. +- Added missing \* keyword separators in services. ### Fixed @@ -428,7 +434,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added new `TwilioFrameSerializer`. This is a new serializer that knows how to serialize and deserialize audio frames from Twilio. -- Added Daily transport event: `on_dialout_answered`. See +- Added Daily transport event: `on_dialout_answered`. See https://reference-python.daily.co/api_reference.html#daily.EventHandler - Added new `AzureSTTService`. This allows you to use Azure Speech-To-Text. @@ -668,7 +674,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added Daily transport support for dial-in use cases. - Added Daily transport events: `on_dialout_connected`, `on_dialout_stopped`, - `on_dialout_error` and `on_dialout_warning`. See + `on_dialout_error` and `on_dialout_warning`. 
See https://reference-python.daily.co/api_reference.html#daily.EventHandler ## [0.0.21] - 2024-05-22 From 78a3f081de561097346e72bed2e7313597bbf2a3 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Fri, 20 Sep 2024 18:21:06 -0700 Subject: [PATCH 10/10] fixup for serialization issue --- src/pipecat/transports/services/daily.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index e28fe6083..66d3b21d2 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -736,11 +736,11 @@ async def send_metrics(self, frame: MetricsFrame): if isinstance(d, TTFBMetricsData): if "ttfb" not in metrics: metrics["ttfb"] = [] - metrics["ttfb"].append(d.model_dump()) + metrics["ttfb"].append(d.model_dump(exclude_none=True)) elif isinstance(d, ProcessingMetricsData): if "processing" not in metrics: metrics["processing"] = [] - metrics["processing"].append(d.model_dump()) + metrics["processing"].append(d.model_dump(exclude_none=True)) elif isinstance(d, LLMUsageMetricsData): if "tokens" not in metrics: metrics["tokens"] = [] @@ -748,7 +748,7 @@ async def send_metrics(self, frame: MetricsFrame): elif isinstance(d, TTSUsageMetricsData): if "characters" not in metrics: metrics["characters"] = [] - metrics["characters"].append(d.model_dump()) + metrics["characters"].append(d.model_dump(exclude_none=True)) message = DailyTransportMessageFrame(message={ "type": "pipecat-metrics",
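
The exclude_none=True change above keeps never-set optional fields out of the pipecat-metrics app message instead of sending them as nulls. A hedged illustration of that pydantic behavior; ExampleMetricsData is a made-up stand-in, not pipecat's actual metrics model:

```python
# Hedged illustration of model_dump(exclude_none=True).
from typing import Optional

from pydantic import BaseModel


class ExampleMetricsData(BaseModel):
    processor: str
    value: float
    model: Optional[str] = None


d = ExampleMetricsData(processor="tts", value=0.42)
print(d.model_dump())                   # {'processor': 'tts', 'value': 0.42, 'model': None}
print(d.model_dump(exclude_none=True))  # {'processor': 'tts', 'value': 0.42}
```
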