introduce input/output audio and image frames
We now distinguish between input and output audio and image frames. We introduce
`InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` and
`OutputImageRawFrame` (and other subclasses of those). Input frames usually come
from an input transport and are meant to be processed inside the pipeline to
generate new frames; they will not be sent through an output transport. Output
frames can also be processed by any frame processor in the pipeline, and they
are allowed to be sent by the output transport.
aconchillo committed Sep 20, 2024
1 parent ed409d0 commit 7e39d9a
Showing 48 changed files with 425 additions and 275 deletions.
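
The core of the change: frames captured by an input transport (`InputAudioRawFrame`, `InputImageRawFrame`) are no longer the same type as the frames an output transport will send (`OutputAudioRawFrame`, `OutputImageRawFrame`), so a pipeline that wants captured media played back must convert explicitly. The mirror examples below do exactly this with a new `MirrorProcessor`; here is a minimal audio-only sketch of the pattern (the `AudioMirror` name is hypothetical; the constructor arguments follow how these frames are built elsewhere in this commit):

from pipecat.frames.frames import Frame, InputAudioRawFrame, OutputAudioRawFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class AudioMirror(FrameProcessor):
    # Hypothetical example processor: re-wraps captured audio as output
    # audio so that an output transport will send it. An input frame
    # pushed downstream unchanged would never reach the wire.

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, InputAudioRawFrame):
            # Same payload, re-typed as an output frame.
            await self.push_frame(OutputAudioRawFrame(
                audio=frame.audio,
                sample_rate=frame.sample_rate,
                num_channels=frame.num_channels))
        else:
            await self.push_frame(frame, direction)

A pipeline would then read `Pipeline([transport.input(), AudioMirror(), transport.output()])`, as in the updated 09-mirror.py example below.
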
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -63,6 +63,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- We now distinguish between input and output audio and image frames. We
+  introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame`
+  and `OutputImageRawFrame` (and other subclasses of those). The input frames
+  usually come from an input transport and are meant to be processed inside the
+  pipeline to generate new frames. However, the input frames will not be sent
+  through an output transport. The output frames can also be processed by any
+  frame processor in the pipeline and they are allowed to be sent by the output
+  transport.
+
 - `ParallelTask` has been renamed to `SyncParallelPipeline`. A
   `SyncParallelPipeline` is a frame processor that contains a list of different
   pipelines to be executed concurrently. The difference between a
2 changes: 1 addition & 1 deletion examples/dialin-chatbot/requirements.txt
@@ -1,4 +1,4 @@
-pipecat-ai[daily,openai,silero]
+pipecat-ai[daily,elevenlabs,openai,silero]
 fastapi
 uvicorn
 python-dotenv
12 changes: 9 additions & 3 deletions examples/foundational/05a-local-sync-speech-and-image.py
@@ -11,7 +11,13 @@
 
 import tkinter as tk
 
-from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame
+from pipecat.frames.frames import (
+    Frame,
+    OutputAudioRawFrame,
+    TTSAudioRawFrame,
+    URLImageRawFrame,
+    LLMMessagesFrame,
+    TextFrame)
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
@@ -65,9 +71,9 @@ def __init__(self):
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, AudioRawFrame):
+        if isinstance(frame, TTSAudioRawFrame):
             self.audio.extend(frame.audio)
-            self.frame = AudioRawFrame(
+            self.frame = OutputAudioRawFrame(
                 bytes(self.audio), frame.sample_rate, frame.num_channels)
 
 class ImageGrabber(FrameProcessor):
13 changes: 10 additions & 3 deletions examples/foundational/06a-image-sync.py
@@ -11,7 +11,7 @@
 
 from PIL import Image
 
-from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
+from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineTask
@@ -52,9 +52,16 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         await super().process_frame(frame, direction)
 
         if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM:
-            await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format))
+            await self.push_frame(OutputImageRawFrame(
+                image=self._speaking_image_bytes,
+                size=(1024, 1024),
+                format=self._speaking_image_format)
+            )
             await self.push_frame(frame)
-            await self.push_frame(ImageRawFrame(image=self._waiting_image_bytes, size=(1024, 1024), format=self._waiting_image_format))
+            await self.push_frame(OutputImageRawFrame(
+                image=self._waiting_image_bytes,
+                size=(1024, 1024),
+                format=self._waiting_image_format))
         else:
             await self.push_frame(frame)
 
25 changes: 24 additions & 1 deletion examples/foundational/09-mirror.py
@@ -8,9 +8,11 @@
 import asyncio
 import sys
 
+from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineTask
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transports.services.daily import DailyTransport, DailyParams
 
 from runner import configure
@@ -24,6 +26,27 @@
 logger.add(sys.stderr, level="DEBUG")
 
 
+class MirrorProcessor(FrameProcessor):
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, InputAudioRawFrame):
+            await self.push_frame(OutputAudioRawFrame(
+                audio=frame.audio,
+                sample_rate=frame.sample_rate,
+                num_channels=frame.num_channels)
+            )
+        elif isinstance(frame, InputImageRawFrame):
+            await self.push_frame(OutputImageRawFrame(
+                image=frame.image,
+                size=frame.size,
+                format=frame.format)
+            )
+        else:
+            await self.push_frame(frame, direction)
+
+
 async def main():
     async with aiohttp.ClientSession() as session:
         (room_url, token) = await configure(session)
@@ -44,7 +67,7 @@ async def main():
         async def on_first_participant_joined(transport, participant):
             transport.capture_participant_video(participant["id"])
 
-        pipeline = Pipeline([transport.input(), transport.output()])
+        pipeline = Pipeline([transport.input(), MirrorProcessor(), transport.output()])
 
         runner = PipelineRunner()
 
23 changes: 22 additions & 1 deletion examples/foundational/09a-local-mirror.py
@@ -10,9 +10,11 @@
 
 import tkinter as tk
 
+from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineTask
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.local.tk import TkLocalTransport
 from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -27,6 +29,25 @@
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 
+class MirrorProcessor(FrameProcessor):
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, InputAudioRawFrame):
+            await self.push_frame(OutputAudioRawFrame(
+                audio=frame.audio,
+                sample_rate=frame.sample_rate,
+                num_channels=frame.num_channels)
+            )
+        elif isinstance(frame, InputImageRawFrame):
+            await self.push_frame(OutputImageRawFrame(
+                image=frame.image,
+                size=frame.size,
+                format=frame.format)
+            )
+        else:
+            await self.push_frame(frame, direction)
+
 async def main():
     async with aiohttp.ClientSession() as session:
@@ -52,7 +73,7 @@ async def main():
         async def on_first_participant_joined(transport, participant):
             transport.capture_participant_video(participant["id"])
 
-        pipeline = Pipeline([daily_transport.input(), tk_transport.output()])
+        pipeline = Pipeline([daily_transport.input(), MirrorProcessor(), tk_transport.output()])
 
         task = PipelineTask(pipeline)
 
6 changes: 3 additions & 3 deletions examples/foundational/11-sound-effects.py
@@ -12,9 +12,9 @@
 
 from pipecat.frames.frames import (
     Frame,
-    AudioRawFrame,
     LLMFullResponseEndFrame,
     LLMMessagesFrame,
+    OutputAudioRawFrame,
 )
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
@@ -53,8 +53,8 @@
     filename = os.path.splitext(os.path.basename(full_path))[0]
     # Open the image and convert it to bytes
     with wave.open(full_path) as audio_file:
-        sounds[file] = AudioRawFrame(audio_file.readframes(-1),
-                                     audio_file.getframerate(), audio_file.getnchannels())
+        sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1),
+                                           audio_file.getframerate(), audio_file.getnchannels())
 
 
 class OutboundSoundEffectWrapper(FrameProcessor):
11 changes: 8 additions & 3 deletions examples/moondream-chatbot/bot.py
@@ -13,10 +13,11 @@
 
 from pipecat.frames.frames import (
-    ImageRawFrame,
+    OutputImageRawFrame,
     SpriteFrame,
     Frame,
     LLMMessagesFrame,
-    AudioRawFrame,
+    TTSAudioRawFrame,
     TTSStoppedFrame,
     TextFrame,
     UserImageRawFrame,
@@ -59,7 +60,11 @@
     # Get the filename without the extension to use as the dictionary key
     # Open the image and convert it to bytes
     with Image.open(full_path) as img:
-        sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
+        sprites.append(OutputImageRawFrame(
+            image=img.tobytes(),
+            size=img.size,
+            format=img.format)
+        )
 
 flipped = sprites[::-1]
 sprites.extend(flipped)
@@ -82,7 +87,7 @@ def __init__(self):
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, AudioRawFrame):
+        if isinstance(frame, TTSAudioRawFrame):
             if not self._is_talking:
                 await self.push_frame(talking_frame)
                 self._is_talking = True
2 changes: 1 addition & 1 deletion examples/moondream-chatbot/requirements.txt
@@ -1,4 +1,4 @@
 python-dotenv
 fastapi[all]
 uvicorn
-pipecat-ai[daily,moondream,openai,silero]
+pipecat-ai[daily,cartesia,moondream,openai,silero]
7 changes: 4 additions & 3 deletions examples/patient-intake/bot.py
@@ -10,7 +10,7 @@
 import sys
 import wave
 
-from pipecat.frames.frames import AudioRawFrame
+from pipecat.frames.frames import OutputAudioRawFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -49,8 +49,9 @@
     filename = os.path.splitext(os.path.basename(full_path))[0]
     # Open the sound and convert it to bytes
     with wave.open(full_path) as audio_file:
-        sounds[file] = AudioRawFrame(audio_file.readframes(-1),
-                                     audio_file.getframerate(), audio_file.getnchannels())
+        sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1),
+                                           audio_file.getframerate(),
+                                           audio_file.getnchannels())
 
 
 class IntakeProcessor:
2 changes: 1 addition & 1 deletion examples/patient-intake/requirements.txt
@@ -1,4 +1,4 @@
 python-dotenv
 fastapi[all]
 uvicorn
-pipecat-ai[daily,openai,silero]
+pipecat-ai[daily,cartesia,openai,silero]
12 changes: 8 additions & 4 deletions examples/simple-chatbot/bot.py
@@ -16,11 +16,11 @@
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
 from pipecat.frames.frames import (
-    AudioRawFrame,
-    ImageRawFrame,
+    OutputImageRawFrame,
     SpriteFrame,
     Frame,
     LLMMessagesFrame,
+    TTSAudioRawFrame,
     TTSStoppedFrame
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -49,7 +49,11 @@
     # Get the filename without the extension to use as the dictionary key
     # Open the image and convert it to bytes
     with Image.open(full_path) as img:
-        sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
+        sprites.append(OutputImageRawFrame(
+            image=img.tobytes(),
+            size=img.size,
+            format=img.format)
+        )
 
 flipped = sprites[::-1]
 sprites.extend(flipped)
@@ -72,7 +76,7 @@ def __init__(self):
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, AudioRawFrame):
+        if isinstance(frame, TTSAudioRawFrame):
             if not self._is_talking:
                 await self.push_frame(talking_frame)
                 self._is_talking = True
2 changes: 1 addition & 1 deletion examples/simple-chatbot/requirements.txt
@@ -1,4 +1,4 @@
 python-dotenv
 fastapi[all]
 uvicorn
-pipecat-ai[daily,openai,silero]
+pipecat-ai[daily,elevenlabs,openai,silero]
2 changes: 1 addition & 1 deletion examples/storytelling-chatbot/requirements.txt
@@ -2,4 +2,4 @@ async_timeout
 fastapi
 uvicorn
 python-dotenv
-pipecat-ai[daily,openai,fal]
+pipecat-ai[daily,elevenlabs,openai,fal]
11 changes: 6 additions & 5 deletions examples/storytelling-chatbot/src/utils/helpers.py
@@ -2,7 +2,7 @@
 import wave
 from PIL import Image
 
-from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
+from pipecat.frames.frames import OutputAudioRawFrame, OutputImageRawFrame
 
 script_dir = os.path.dirname(__file__)
 
@@ -16,7 +16,8 @@ def load_images(image_files):
         filename = os.path.splitext(os.path.basename(full_path))[0]
         # Open the image and convert it to bytes
         with Image.open(full_path) as img:
-            images[filename] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)
+            images[filename] = OutputImageRawFrame(
+                image=img.tobytes(), size=img.size, format=img.format)
     return images
 
 
@@ -30,8 +31,8 @@ def load_sounds(sound_files):
         filename = os.path.splitext(os.path.basename(full_path))[0]
         # Open the sound and convert it to bytes
         with wave.open(full_path) as audio_file:
-            sounds[filename] = AudioRawFrame(audio=audio_file.readframes(-1),
-                                             sample_rate=audio_file.getframerate(),
-                                             num_channels=audio_file.getnchannels())
+            sounds[filename] = OutputAudioRawFrame(audio=audio_file.readframes(-1),
+                                                   sample_rate=audio_file.getframerate(),
+                                                   num_channels=audio_file.getnchannels())
 
     return sounds
2 changes: 1 addition & 1 deletion examples/twilio-chatbot/README.md
@@ -55,7 +55,7 @@ This project is a FastAPI-based chatbot that integrates with Twilio to handle We
 2. **Update the Twilio Webhook**:
    Copy the ngrok URL and update your Twilio phone number webhook URL to `http://<ngrok_url>/start_call`.
 
-3. **Update the streams.xml**:
+3. **Update streams.xml**:
    Copy the ngrok URL and update templates/streams.xml with `wss://<ngrok_url>/ws`.
 
 ## Running the Application