Skip to content

Commit

Permalink
introduce input/output audio and image frames
Browse files Browse the repository at this point in the history
  • Loading branch information
aconchillo committed Sep 20, 2024
1 parent ed409d0 commit 116e690
Show file tree
Hide file tree
Showing 47 changed files with 416 additions and 275 deletions.
2 changes: 1 addition & 1 deletion examples/dialin-chatbot/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pipecat-ai[daily,openai,silero]
pipecat-ai[daily,elevenlabs,openai,silero]
fastapi
uvicorn
python-dotenv
Expand Down
12 changes: 9 additions & 3 deletions examples/foundational/05a-local-sync-speech-and-image.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@

import tkinter as tk

from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame
from pipecat.frames.frames import (
Frame,
OutputAudioRawFrame,
TTSAudioRawFrame,
URLImageRawFrame,
LLMMessagesFrame,
TextFrame)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
Expand Down Expand Up @@ -65,9 +71,9 @@ def __init__(self):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, AudioRawFrame):
if isinstance(frame, TTSAudioRawFrame):
self.audio.extend(frame.audio)
self.frame = AudioRawFrame(
self.frame = OutputAudioRawFrame(
bytes(self.audio), frame.sample_rate, frame.num_channels)

class ImageGrabber(FrameProcessor):
Expand Down
13 changes: 10 additions & 3 deletions examples/foundational/06a-image-sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from PIL import Image

from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
Expand Down Expand Up @@ -52,9 +52,16 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM:
await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format))
await self.push_frame(OutputImageRawFrame(
image=self._speaking_image_bytes,
size=(1024, 1024),
format=self._speaking_image_format)
)
await self.push_frame(frame)
await self.push_frame(ImageRawFrame(image=self._waiting_image_bytes, size=(1024, 1024), format=self._waiting_image_format))
await self.push_frame(OutputImageRawFrame(
image=self._waiting_image_bytes,
size=(1024, 1024),
format=self._waiting_image_format))
else:
await self.push_frame(frame)

Expand Down
25 changes: 24 additions & 1 deletion examples/foundational/09-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
import asyncio
import sys

from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.services.daily import DailyTransport, DailyParams

from runner import configure
Expand All @@ -24,6 +26,27 @@
logger.add(sys.stderr, level="DEBUG")


class MirrorProcessor(FrameProcessor):
    """Echo processor: re-emits captured input media as output media.

    Input audio frames are repackaged as ``OutputAudioRawFrame`` and input
    image frames as ``OutputImageRawFrame`` so the downstream transport
    plays them back. Every other frame is forwarded unchanged in its
    original direction.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Let the base class run its bookkeeping first.
        await super().process_frame(frame, direction)

        if isinstance(frame, InputAudioRawFrame):
            mirrored = OutputAudioRawFrame(
                audio=frame.audio,
                sample_rate=frame.sample_rate,
                num_channels=frame.num_channels,
            )
            # Converted frames use the default (downstream) direction.
            await self.push_frame(mirrored)
        elif isinstance(frame, InputImageRawFrame):
            mirrored = OutputImageRawFrame(
                image=frame.image,
                size=frame.size,
                format=frame.format,
            )
            await self.push_frame(mirrored)
        else:
            # Pass everything else through untouched.
            await self.push_frame(frame, direction)


async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
Expand All @@ -44,7 +67,7 @@ async def main():
async def on_first_participant_joined(transport, participant):
transport.capture_participant_video(participant["id"])

pipeline = Pipeline([transport.input(), transport.output()])
pipeline = Pipeline([transport.input(), MirrorProcessor(), transport.output()])

runner = PipelineRunner()

Expand Down
23 changes: 22 additions & 1 deletion examples/foundational/09a-local-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@

import tkinter as tk

from pipecat.frames.frames import Frame, InputAudioRawFrame, InputImageRawFrame, OutputAudioRawFrame, OutputImageRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
from pipecat.transports.services.daily import DailyParams, DailyTransport
Expand All @@ -27,6 +29,25 @@
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

class MirrorProcessor(FrameProcessor):
    """Mirrors incoming media: input audio/image frames become their
    output counterparts so the local transport renders what was captured;
    all other frames pass through unchanged.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Base-class processing must always happen first.
        await super().process_frame(frame, direction)

        if isinstance(frame, InputAudioRawFrame):
            # Repackage captured audio for playback; the push uses the
            # default (downstream) direction.
            await self.push_frame(
                OutputAudioRawFrame(audio=frame.audio,
                                    sample_rate=frame.sample_rate,
                                    num_channels=frame.num_channels))
            return
        if isinstance(frame, InputImageRawFrame):
            # Repackage captured video frames for display.
            await self.push_frame(
                OutputImageRawFrame(image=frame.image,
                                    size=frame.size,
                                    format=frame.format))
            return
        # Anything else is forwarded as-is, preserving its direction.
        await self.push_frame(frame, direction)

async def main():
async with aiohttp.ClientSession() as session:
Expand All @@ -52,7 +73,7 @@ async def main():
async def on_first_participant_joined(transport, participant):
transport.capture_participant_video(participant["id"])

pipeline = Pipeline([daily_transport.input(), tk_transport.output()])
pipeline = Pipeline([daily_transport.input(), MirrorProcessor(), tk_transport.output()])

task = PipelineTask(pipeline)

Expand Down
6 changes: 3 additions & 3 deletions examples/foundational/11-sound-effects.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@

from pipecat.frames.frames import (
Frame,
AudioRawFrame,
LLMFullResponseEndFrame,
LLMMessagesFrame,
OutputAudioRawFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
Expand Down Expand Up @@ -53,8 +53,8 @@
filename = os.path.splitext(os.path.basename(full_path))[0]
# Open the image and convert it to bytes
with wave.open(full_path) as audio_file:
sounds[file] = AudioRawFrame(audio_file.readframes(-1),
audio_file.getframerate(), audio_file.getnchannels())
sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1),
audio_file.getframerate(), audio_file.getnchannels())


class OutboundSoundEffectWrapper(FrameProcessor):
Expand Down
11 changes: 8 additions & 3 deletions examples/moondream-chatbot/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@

from pipecat.frames.frames import (
ImageRawFrame,
OutputImageRawFrame,
SpriteFrame,
Frame,
LLMMessagesFrame,
AudioRawFrame,
TTSAudioRawFrame,
TTSStoppedFrame,
TextFrame,
UserImageRawFrame,
Expand Down Expand Up @@ -59,7 +60,11 @@
# Get the filename without the extension to use as the dictionary key
# Open the image and convert it to bytes
with Image.open(full_path) as img:
sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
sprites.append(OutputImageRawFrame(
image=img.tobytes(),
size=img.size,
format=img.format)
)

flipped = sprites[::-1]
sprites.extend(flipped)
Expand All @@ -82,7 +87,7 @@ def __init__(self):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, AudioRawFrame):
if isinstance(frame, TTSAudioRawFrame):
if not self._is_talking:
await self.push_frame(talking_frame)
self._is_talking = True
Expand Down
2 changes: 1 addition & 1 deletion examples/moondream-chatbot/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
python-dotenv
fastapi[all]
uvicorn
pipecat-ai[daily,moondream,openai,silero]
pipecat-ai[daily,cartesia,moondream,openai,silero]
7 changes: 4 additions & 3 deletions examples/patient-intake/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sys
import wave

from pipecat.frames.frames import AudioRawFrame
from pipecat.frames.frames import OutputAudioRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
Expand Down Expand Up @@ -49,8 +49,9 @@
filename = os.path.splitext(os.path.basename(full_path))[0]
# Open the sound and convert it to bytes
with wave.open(full_path) as audio_file:
sounds[file] = AudioRawFrame(audio_file.readframes(-1),
audio_file.getframerate(), audio_file.getnchannels())
sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1),
audio_file.getframerate(),
audio_file.getnchannels())


class IntakeProcessor:
Expand Down
2 changes: 1 addition & 1 deletion examples/patient-intake/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
python-dotenv
fastapi[all]
uvicorn
pipecat-ai[daily,openai,silero]
pipecat-ai[daily,cartesia,openai,silero]
12 changes: 8 additions & 4 deletions examples/simple-chatbot/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
from pipecat.frames.frames import (
AudioRawFrame,
ImageRawFrame,
OutputImageRawFrame,
SpriteFrame,
Frame,
LLMMessagesFrame,
TTSAudioRawFrame,
TTSStoppedFrame
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
Expand Down Expand Up @@ -49,7 +49,11 @@
# Get the filename without the extension to use as the dictionary key
# Open the image and convert it to bytes
with Image.open(full_path) as img:
sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
sprites.append(OutputImageRawFrame(
image=img.tobytes(),
size=img.size,
format=img.format)
)

flipped = sprites[::-1]
sprites.extend(flipped)
Expand All @@ -72,7 +76,7 @@ def __init__(self):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, AudioRawFrame):
if isinstance(frame, TTSAudioRawFrame):
if not self._is_talking:
await self.push_frame(talking_frame)
self._is_talking = True
Expand Down
2 changes: 1 addition & 1 deletion examples/simple-chatbot/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
python-dotenv
fastapi[all]
uvicorn
pipecat-ai[daily,openai,silero]
pipecat-ai[daily,elevenlabs,openai,silero]
2 changes: 1 addition & 1 deletion examples/storytelling-chatbot/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ async_timeout
fastapi
uvicorn
python-dotenv
pipecat-ai[daily,openai,fal]
pipecat-ai[daily,elevenlabs,openai,fal]
11 changes: 6 additions & 5 deletions examples/storytelling-chatbot/src/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import wave
from PIL import Image

from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
from pipecat.frames.frames import OutputAudioRawFrame, OutputImageRawFrame

script_dir = os.path.dirname(__file__)

Expand All @@ -16,7 +16,8 @@ def load_images(image_files):
filename = os.path.splitext(os.path.basename(full_path))[0]
# Open the image and convert it to bytes
with Image.open(full_path) as img:
images[filename] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)
images[filename] = OutputImageRawFrame(
image=img.tobytes(), size=img.size, format=img.format)
return images


Expand All @@ -30,8 +31,8 @@ def load_sounds(sound_files):
filename = os.path.splitext(os.path.basename(full_path))[0]
# Open the sound and convert it to bytes
with wave.open(full_path) as audio_file:
sounds[filename] = AudioRawFrame(audio=audio_file.readframes(-1),
sample_rate=audio_file.getframerate(),
num_channels=audio_file.getnchannels())
sounds[filename] = OutputAudioRawFrame(audio=audio_file.readframes(-1),
sample_rate=audio_file.getframerate(),
num_channels=audio_file.getnchannels())

return sounds
2 changes: 1 addition & 1 deletion examples/twilio-chatbot/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ This project is a FastAPI-based chatbot that integrates with Twilio to handle We
2. **Update the Twilio Webhook**:
Copy the ngrok URL and update your Twilio phone number webhook URL to `http://<ngrok_url>/start_call`.

3. **Update the streams.xml**:
3. **Update streams.xml**:
Copy the ngrok URL and update templates/streams.xml with `wss://<ngrok_url>/ws`.

## Running the Application
Expand Down
Loading

0 comments on commit 116e690

Please sign in to comment.