From 7c41246e55ada40538dce8b7c09b8d32b6fb0252 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?=
Date: Tue, 14 May 2024 00:32:37 -0700
Subject: [PATCH] examples: fix storytelling example

---
 examples/storytelling-chatbot/env.example          |  11 +-
 .../storytelling-chatbot/requirements.txt          |   4 +-
 examples/storytelling-chatbot/src/bot.py           | 143 ++++++++----------
 .../storytelling-chatbot/src/processors.py         |  54 +++----
 examples/storytelling-chatbot/src/prompts.py       |   2 +-
 examples/storytelling-chatbot/src/server.py        |   2 +-
 .../src/utils/daily_helpers.py                     |   2 +-
 .../storytelling-chatbot/src/utils/helpers.py      |   8 +-
 8 files changed, 104 insertions(+), 122 deletions(-)

diff --git a/examples/storytelling-chatbot/env.example b/examples/storytelling-chatbot/env.example
index e32c9fe23..56b121a4d 100644
--- a/examples/storytelling-chatbot/env.example
+++ b/examples/storytelling-chatbot/env.example
@@ -1,6 +1,5 @@
-ELEVENLABS_API_KEY=
-ELEVENLABS_VOICE_ID=
-FAL_KEY=
-DAILY_API_URL=api.daily.co/v1
-DAILY_API_KEY=
-OPENAI_API_KEY=
\ No newline at end of file
+DAILY_API_KEY=7df...
+ELEVENLABS_API_KEY=aeb...
+ELEVENLABS_VOICE_ID=7S...
+FAL_KEY=8c...
+OPENAI_API_KEY=sk-PL...
diff --git a/examples/storytelling-chatbot/requirements.txt b/examples/storytelling-chatbot/requirements.txt
index 7b66f55dd..78cebe200 100644
--- a/examples/storytelling-chatbot/requirements.txt
+++ b/examples/storytelling-chatbot/requirements.txt
@@ -1,5 +1,5 @@
-dailyai[daily,openai,fal]==0.0.8
 fastapi
 uvicorn
 requests
-python-dotenv
\ No newline at end of file
+python-dotenv
+pipecat-ai[daily,openai,fal]
diff --git a/examples/storytelling-chatbot/src/bot.py b/examples/storytelling-chatbot/src/bot.py
index b9d328e60..1769a50c3 100644
--- a/examples/storytelling-chatbot/src/bot.py
+++ b/examples/storytelling-chatbot/src/bot.py
@@ -1,37 +1,32 @@
+import argparse
 import asyncio
 import aiohttp
-import logging
 import os
-import argparse
+import sys

-from dailyai.pipeline.pipeline import Pipeline
-from dailyai.pipeline.frames import (
-    AudioFrame,
-    ImageFrame,
-    EndPipeFrame,
-    LLMMessagesFrame,
-    SendAppMessageFrame
-)
-from dailyai.pipeline.aggregators import (
-    LLMUserResponseAggregator,
-    LLMAssistantResponseAggregator,
-)
-from dailyai.transports.daily_transport import DailyTransport
-from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
-from dailyai.services.open_ai_services import OpenAILLMService
-from dailyai.services.fal_ai_services import FalImageGenService
+from pipecat.frames.frames import LLMMessagesFrame, StopTaskFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineTask
+from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
+from pipecat.services.elevenlabs import ElevenLabsTTSService
+from pipecat.services.fal import FalImageGenService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
+
+from pipecat.vad.silero import SileroVAD

 from processors import StoryProcessor, StoryImageProcessor
 from prompts import LLM_BASE_PROMPT, LLM_INTRO_PROMPT, CUE_USER_TURN
 from utils.helpers import load_sounds, load_images

+from loguru import logger
+
 from dotenv import load_dotenv
 load_dotenv(override=True)

-logging.basicConfig(format=f"[STORYBOT] %(levelno)s %(asctime)s %(message)s")
-logger = logging.getLogger("dailyai")
-logger.setLevel(logging.INFO)
-
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")

 sounds = load_sounds(["listening.wav"])
 images = load_images(["book1.png", "book2.png"])
@@ -46,22 +41,23 @@ async def main(room_url, token=None):
             room_url,
             token,
             "Storytelling Bot",
-            duration_minutes=5,
-            start_transcription=True,
-            mic_enabled=True,
-            mic_sample_rate=16000,
-            vad_enabled=True,
-            camera_framerate=30,
-            camera_bitrate=680000,
-            camera_enabled=True,
-            camera_width=768,
-            camera_height=768,
+            DailyParams(
+                audio_in_enabled=True,
+                audio_out_enabled=True,
+                camera_out_enabled=True,
+                camera_out_width=768,
+                camera_out_height=768,
+                transcription_enabled=True,
+                vad_enabled=True,
+            )
         )

         logger.debug("Transport created for room:" + room_url)

         # -------------- Services --------------- #

+        # vad = SileroVAD()
+
         llm_service = OpenAILLMService(
             api_key=os.getenv("OPENAI_API_KEY"),
             model="gpt-4-turbo"
@@ -103,68 +99,55 @@ async def main(room_url, token=None):

         # -------------- Story Loop ------------- #

-        logger.debug("Waiting for participant...")
+        runner = PipelineRunner()

-        start_storytime_event = asyncio.Event()
+        # The intro pipeline is used to start
+        # the story (as per LLM_INTRO_PROMPT)
+        intro_pipeline = Pipeline([llm_service, tts_service, transport.output()])

-        @transport.event_handler("on_first_other_participant_joined")
-        async def on_first_other_participant_joined(transport, participant):
-            logger.debug("Participant joined, storytime commence!")
-            start_storytime_event.set()
-
-        # The storytime coroutine will wait for the start_storytime_event
-        # to be set before starting the storytime pipeline
-        async def storytime():
-            await start_storytime_event.wait()
+        intro_task = PipelineTask(intro_pipeline)

-            # The intro pipeline is used to start
-            # the story (as per LLM_INTRO_PROMPT)
-            intro_pipeline = Pipeline(processors=[
-                llm_service,
-                tts_service,
-            ], sink=transport.send_queue)
+        logger.debug("Waiting for participant...")

-            await intro_pipeline.queue_frames(
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            logger.debug("Participant joined, storytime commence!")
+            transport.capture_participant_transcription(participant["id"])
+            await intro_task.queue_frames(
                 [
-                    ImageFrame(images['book1'], (768, 768)),
+                    images['book1'],
                     LLMMessagesFrame([LLM_INTRO_PROMPT]),
-                    SendAppMessageFrame(CUE_USER_TURN, None),
-                    AudioFrame(sounds["listening"]),
-                    ImageFrame(images['book2'], (768, 768)),
-                    EndPipeFrame(),
+                    DailyTransportMessageFrame(CUE_USER_TURN),
+                    sounds["listening"],
+                    images['book2'],
+                    StopTaskFrame()
                 ]
             )

-            # We start the pipeline as soon as the user joins
-            await intro_pipeline.run_pipeline()
-
-            # The main story pipeline is used to continue the
-            # story based on user input
-            pipeline = Pipeline(processors=[
-                user_responses,
-                llm_service,
-                story_processor,
-                image_processor,
-                tts_service,
-                llm_responses,
-            ])
-
-            await transport.run_pipeline(pipeline)
-
-        transport.transcription_settings["extra"]["endpointing"] = True
-        transport.transcription_settings["extra"]["punctuate"] = True
+        # We run the intro pipeline. This will start the transport. The intro
+        # task will exit after StopTaskFrame is processed.
+        await runner.run(intro_task)

-        try:
-            await asyncio.gather(transport.run(), storytime())
-        except (asyncio.CancelledError, KeyboardInterrupt):
-            transport.stop()
+        # The main story pipeline is used to continue the story based on user
+        # input.
+        main_pipeline = Pipeline([
+            transport.input(),
+            # vad,
+            user_responses,
+            llm_service,
+            story_processor,
+            image_processor,
+            tts_service,
+            llm_responses,
+            transport.output()
+        ])

-        logger.debug("Pipeline finished. Exiting.")
+        main_task = PipelineTask(main_pipeline)
+        await runner.run(main_task)


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Daily Storyteller Bot")
+    parser = argparse.ArgumentParser(description="Daily Storyteller Bot")
     parser.add_argument("-u", type=str, help="Room URL")
     parser.add_argument("-t", type=str, help="Token")
     config = parser.parse_args()
diff --git a/examples/storytelling-chatbot/src/processors.py b/examples/storytelling-chatbot/src/processors.py
index c056355e4..30528af94 100644
--- a/examples/storytelling-chatbot/src/processors.py
+++ b/examples/storytelling-chatbot/src/processors.py
@@ -1,19 +1,13 @@
-from typing import AsyncGenerator
 import re

-from dailyai.pipeline.frames import TextFrame, Frame, AudioFrame
-from dailyai.pipeline.frame_processor import FrameProcessor
-from dailyai.pipeline.frames import (
-    Frame,
-    TextFrame,
-    SendAppMessageFrame,
-    LLMResponseEndFrame,
-    UserStoppedSpeakingFrame,
-)
+from async_timeout import timeout
+
+from pipecat.frames.frames import Frame, LLMResponseEndFrame, TextFrame, UserStoppedSpeakingFrame
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.transports.services.daily import DailyTransportMessageFrame

 from utils.helpers import load_sounds
 from prompts import IMAGE_GEN_PROMPT, CUE_USER_TURN, CUE_ASSISTANT_TURN
-import asyncio

 sounds = load_sounds(["talking.wav", "listening.wav", "ding.wav"])

@@ -42,7 +36,7 @@ class StoryImageProcessor(FrameProcessor):
     Processor for image prompt frames that will be sent to the FAL service.

     This processor is responsible for consuming frames of type `StoryImageFrame`.
-    It processes the by passing it to the FAL service
+    It processes them by passing them to the FAL service.
     The processed frames are then yielded back.

     Attributes:
@@ -50,25 +44,26 @@ class StoryImageProcessor(FrameProcessor):
     """

     def __init__(self, fal_service):
+        super().__init__()
         self._fal_service = fal_service

-    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
         if isinstance(frame, StoryImageFrame):
             try:
-                async with asyncio.timeout(7):
-                    async for i in self._fal_service.process_frame(TextFrame(IMAGE_GEN_PROMPT % frame.text)):
-                        yield i
+                async with timeout(7):
+                    async for i in self._fal_service.run_image_gen(IMAGE_GEN_PROMPT % frame.text):
+                        await self.push_frame(i)
             except TimeoutError:
                 pass
             pass
         else:
-            yield frame
+            await self.push_frame(frame)


 class StoryProcessor(FrameProcessor):
     """
     Primary frame processor. It takes the frames generated by the LLM
-    and processes them into image prompts and story pages (sentences.)
+    and processes them into image prompts and story pages (sentences).
     For a clearer picture of how this works, reference prompts.py

     Attributes:
@@ -81,15 +76,16 @@ class StoryProcessor(FrameProcessor):
     """

     def __init__(self, messages, story):
+        super().__init__()
         self._messages = messages
         self._text = ""
         self._story = story

-    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
         if isinstance(frame, UserStoppedSpeakingFrame):
             # Send an app message to the UI
-            yield SendAppMessageFrame(CUE_ASSISTANT_TURN, None)
-            yield AudioFrame(sounds["talking"])
+            await self.push_frame(DailyTransportMessageFrame(CUE_ASSISTANT_TURN))
+            await self.push_frame(sounds["talking"])

         elif isinstance(frame, TextFrame):
             # We want to look for sentence breaks in the text
@@ -111,7 +107,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
                 # Remove the image prompt from the text
                 self._text = re.sub(r"<.*?>", '', self._text, count=1)
                 # Process the image prompt frame
-                yield StoryImageFrame(image_prompt)
+                await self.push_frame(StoryImageFrame(image_prompt))

             # STORY PAGE
             # Looking for: [break] in the LLM response
@@ -126,9 +122,9 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
                 if len(self._text) > 2:
                     # Append the sentence to the story
                     self._story.append(self._text)
-                    yield StoryPageFrame(self._text)
+                    await self.push_frame(StoryPageFrame(self._text))
                     # Assert that it's the LLMs turn, until we're finished
-                    yield SendAppMessageFrame(CUE_ASSISTANT_TURN, None)
+                    await self.push_frame(DailyTransportMessageFrame(CUE_ASSISTANT_TURN))
                 # Clear the buffer
                 self._text = ""

@@ -136,13 +132,13 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
         # Driven by the prompt, the LLM should have asked the user for input
         elif isinstance(frame, LLMResponseEndFrame):
             # We use a different frame type, as to avoid image generation ingest
-            yield StoryPromptFrame(self._text)
+            await self.push_frame(StoryPromptFrame(self._text))
             self._text = ""
-            yield frame
+            await self.push_frame(frame)
             # Send an app message to the UI
-            yield SendAppMessageFrame(CUE_USER_TURN, None)
-            yield AudioFrame(sounds["listening"])
+            await self.push_frame(DailyTransportMessageFrame(CUE_USER_TURN))
+            await self.push_frame(sounds["listening"])

         # Anything that is not a TextFrame pass through
         else:
-            yield frame
+            await self.push_frame(frame)
diff --git a/examples/storytelling-chatbot/src/prompts.py b/examples/storytelling-chatbot/src/prompts.py
index 9df917234..551a7c4f2 100644
--- a/examples/storytelling-chatbot/src/prompts.py
+++ b/examples/storytelling-chatbot/src/prompts.py
@@ -3,7 +3,7 @@
     "content": "You are a creative storyteller who loves to tell whimsical, fantastical stories. \
         Your goal is to craft an engaging and fun story. \
         Start by asking the user what kind of story they'd like to hear. Don't provide any examples. \
-        Keep your reponse to only a few sentences."
+        Keep your response to only a few sentences."
 }
diff --git a/examples/storytelling-chatbot/src/server.py b/examples/storytelling-chatbot/src/server.py
index 7596af2ed..89f67a15b 100644
--- a/examples/storytelling-chatbot/src/server.py
+++ b/examples/storytelling-chatbot/src/server.py
@@ -8,7 +8,7 @@
 from fastapi import FastAPI, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
-from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
+from fastapi.responses import FileResponse, JSONResponse

 from utils.daily_helpers import create_room as _create_room, get_token, get_name_from_url

diff --git a/examples/storytelling-chatbot/src/utils/daily_helpers.py b/examples/storytelling-chatbot/src/utils/daily_helpers.py
index 0a2b472f6..140f710e4 100644
--- a/examples/storytelling-chatbot/src/utils/daily_helpers.py
+++ b/examples/storytelling-chatbot/src/utils/daily_helpers.py
@@ -9,7 +9,7 @@

 load_dotenv()

-daily_api_path = os.getenv("DAILY_API_URL")
+daily_api_path = os.getenv("DAILY_API_URL") or "api.daily.co/v1"
 daily_api_key = os.getenv("DAILY_API_KEY")


diff --git a/examples/storytelling-chatbot/src/utils/helpers.py b/examples/storytelling-chatbot/src/utils/helpers.py
index f152be6e4..2c576fdff 100644
--- a/examples/storytelling-chatbot/src/utils/helpers.py
+++ b/examples/storytelling-chatbot/src/utils/helpers.py
@@ -2,6 +2,8 @@
 import wave

 from PIL import Image

+from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
+
 script_dir = os.path.dirname(__file__)


@@ -14,7 +16,7 @@ def load_images(image_files):
         filename = os.path.splitext(os.path.basename(full_path))[0]
         # Open the image and convert it to bytes
         with Image.open(full_path) as img:
-            images[filename] = img.tobytes()
+            images[filename] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)

     return images

@@ -28,6 +30,8 @@ def load_sounds(sound_files):
         filename = os.path.splitext(os.path.basename(full_path))[0]
         # Open the sound and convert it to bytes
         with wave.open(full_path) as audio_file:
-            sounds[filename] = audio_file.readframes(-1)
+            sounds[filename] = AudioRawFrame(audio=audio_file.readframes(-1),
+                                             sample_rate=audio_file.getframerate(),
+                                             num_channels=audio_file.getnchannels())

     return sounds
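
Note for anyone porting a bot along the same lines: the patch boils down to two
pipecat patterns. First, FrameProcessor subclasses now receive (frame, direction)
and push frames downstream with push_frame() instead of yielding them. Second,
pipelines are wrapped in a PipelineTask and driven by a PipelineRunner, with a
StopTaskFrame ending a task (this is how bot.py stops the intro pipeline before
starting the main story pipeline). The sketch below shows both patterns in
isolation; it is an illustration, not part of the patch. It assumes only the
pipecat APIs used above, and UppercaseProcessor, run_demo, and the queued
TextFrame are hypothetical stand-ins.

    import asyncio

    from pipecat.frames.frames import Frame, StopTaskFrame, TextFrame
    from pipecat.pipeline.pipeline import Pipeline
    from pipecat.pipeline.runner import PipelineRunner
    from pipecat.pipeline.task import PipelineTask
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class UppercaseProcessor(FrameProcessor):
        """Hypothetical processor: uppercases TextFrames, passes the rest through."""

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            if isinstance(frame, TextFrame):
                frame = TextFrame(frame.text.upper())
                print(frame.text)  # visible stand-in for a real sink like transport.output()
            # Frames a processor does not consume must still be pushed along
            # (as StoryProcessor does), otherwise the pipeline stalls.
            await self.push_frame(frame)


    async def run_demo():
        task = PipelineTask(Pipeline([UppercaseProcessor()]))
        # Queued frames flow through the pipeline in order; StopTaskFrame makes
        # runner.run() return once it is processed, the same trick bot.py uses
        # to end the intro task.
        await task.queue_frames([TextFrame("once upon a time..."), StopTaskFrame()])
        await PipelineRunner().run(task)


    if __name__ == "__main__":
        asyncio.run(run_demo())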