examples: updated to_be_updated examples #106

Merged · 2 commits · Apr 6, 2024
dot-env.template (12 changes: 10 additions & 2 deletions)
@@ -2,8 +2,16 @@
 ANTHROPIC_API_KEY=...

 # Azure
-SPEECH_KEY=...
-SPEECH_REGION=...
+AZURE_SPEECH_REGION=...
+AZURE_SPEECH_API_KEY=...
+
+AZURE_CHATGPT_API_KEY=...
+AZURE_CHATGPT_ENDPOINT=https://...
+AZURE_CHATGPT_MODEL=...
+
+AZURE_DALLE_API_KEY=...
+AZURE_DALLE_ENDPOINT=https://...
+AZURE_DALLE_MODEL=...

 # Daily
 DAILY_API_KEY=...
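For context, a minimal sketch of how example code might pick up the renamed Azure variables. The `python-dotenv` call here is an assumption (the examples themselves read plain environment variables via `os.getenv`):

    import os

    from dotenv import load_dotenv  # assumes python-dotenv; copy dot-env.template to .env first

    load_dotenv()  # populate os.environ from the .env file

    # The renamed speech variables; os.getenv returns None if a variable is unset.
    speech_key = os.getenv("AZURE_SPEECH_API_KEY")
    speech_region = os.getenv("AZURE_SPEECH_REGION")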
@@ -3,8 +3,9 @@
 import logging
 import tkinter as tk
 import os
+from dailyai.pipeline.aggregators import LLMFullResponseAggregator

-from dailyai.pipeline.frames import AudioFrame, ImageFrame
+from dailyai.pipeline.frames import AudioFrame, ImageFrame, LLMMessagesFrame, TextFrame
 from dailyai.services.open_ai_services import OpenAILLMService
 from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
 from dailyai.services.fal_ai_services import FalImageGenService
@@ -22,7 +23,7 @@ async def main():
     async with aiohttp.ClientSession() as session:
         meeting_duration_minutes = 5
         tk_root = tk.Tk()
-        tk_root.title("Calendar")
+        tk_root.title("dailyai")

         transport = LocalTransport(
             mic_enabled=True,
@@ -43,7 +44,7 @@ async def main():
             api_key=os.getenv("OPENAI_API_KEY"),
             model="gpt-4-turbo-preview")

-        dalle = FalImageGenService(
+        imagegen = FalImageGenService(
             image_size="1024x1024",
             aiohttp_session=session,
             key_id=os.getenv("FAL_KEY_ID"),
@@ -60,18 +61,33 @@ async def get_all_audio(text):

         return all_audio

+        async def get_month_description(aggregator, frame):
+            async for frame in aggregator.process_frame(frame):
+                if isinstance(frame, TextFrame):
+                    return frame.text
+
         async def get_month_data(month):
             messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
                 month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]

-            image_description = await llm.run_llm(messages)
+            messages_frame = LLMMessagesFrame(messages)
+
+            llm_full_response_aggregator = LLMFullResponseAggregator()
+
+            image_description = None
+            async for frame in llm.process_frame(messages_frame):
+                result = await get_month_description(llm_full_response_aggregator, frame)
+                if result:
+                    image_description = result
+                    break
+
+            if not image_description:
+                return

             to_speak = f"{month}: {image_description}"
             audio_task = asyncio.create_task(get_all_audio(to_speak))
             image_task = asyncio.create_task(
-                dalle.run_image_gen(image_description))
+                imagegen.run_image_gen(image_description))
             (audio, image_data) = await asyncio.gather(audio_task, image_task)

             return {
@@ -82,19 +98,14 @@ async def get_month_data(month):
                 "audio": audio,
             }

+        # We only specify 5 months as we create tasks all at once and we might
+        # get rate limited otherwise.
         months: list[str] = [
             "January",
             "February",
             "March",
             "April",
             "May",
-            "June",
-            "July",
-            "August",
-            "September",
-            "October",
-            "November",
-            "December",
         ]

         async def show_images():
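The change above replaces the one-shot `llm.run_llm(messages)` call with the frame-based API: the messages are wrapped in an `LLMMessagesFrame`, the LLM streams its response as frames, and `LLMFullResponseAggregator` collapses them into a single `TextFrame`. A condensed sketch of that flow, using names taken from the diff (illustrative, not the library's documented API):

    messages_frame = LLMMessagesFrame(messages)  # wrap the chat messages in a frame
    aggregator = LLMFullResponseAggregator()     # buffers streamed text chunks

    image_description = None
    async for chunk in llm.process_frame(messages_frame):  # chunks arrive as frames
        async for out in aggregator.process_frame(chunk):
            if isinstance(out, TextFrame):  # emitted once the full response is assembled
                image_description = out.text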
@@ -5,7 +5,8 @@
 import aiohttp
 from PIL import Image

-from dailyai.pipeline.frames import ImageFrame, Frame
+from dailyai.pipeline.frames import ImageFrame, Frame, TextFrame
+from dailyai.pipeline.pipeline import Pipeline
 from dailyai.transports.daily_transport import DailyTransport
 from dailyai.services.ai_services import AIService
 from dailyai.pipeline.aggregators import (
@@ -14,7 +15,6 @@
 )
 from dailyai.services.open_ai_services import OpenAILLMService
 from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
-from dailyai.services.fal_ai_services import FalImageGenService

 from runner import configure
@@ -53,6 +53,7 @@ async def main(room_url: str, token):
         transport._camera_height = 1024
         transport._mic_enabled = True
         transport._mic_sample_rate = 16000
+        transport.transcription_settings["extra"]["punctuate"] = True

         tts = ElevenLabsTTSService(
             aiohttp_session=session,
@@ -64,57 +65,30 @@ async def main(room_url: str, token):
             api_key=os.getenv("OPENAI_API_KEY"),
             model="gpt-4-turbo-preview")

-        img = FalImageGenService(
-            image_size="1024x1024",
-            aiohttp_session=session,
-            key_id=os.getenv("FAL_KEY_ID"),
-            key_secret=os.getenv("FAL_KEY_SECRET"),
-        )
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not include any special characters. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserContextAggregator(
+            messages, transport._my_participant_id)
+        tma_out = LLMAssistantContextAggregator(
+            messages, transport._my_participant_id
+        )
+        image_sync_aggregator = ImageSyncAggregator(
+            os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
+            os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
+        )

-        async def get_images():
-            get_speaking_task = asyncio.create_task(
-                img.run_image_gen("An image of a cat speaking")
-            )
-            get_waiting_task = asyncio.create_task(
-                img.run_image_gen("An image of a cat waiting")
-            )
-
-            (speaking_data, waiting_data) = await asyncio.gather(
-                get_speaking_task, get_waiting_task
-            )
-
-            return speaking_data, waiting_data
+        pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])

         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
-            await tts.say("Hi, I'm listening!", transport.send_queue)
-
-        async def handle_transcriptions():
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
-                },
-            ]
-
-            tma_in = LLMUserContextAggregator(
-                messages, transport._my_participant_id)
-            tma_out = LLMAssistantContextAggregator(
-                messages, transport._my_participant_id
-            )
-            image_sync_aggregator = ImageSyncAggregator(
-                os.path.join(
-                    os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(
-                    os.path.dirname(__file__), "assets", "waiting.png"), )
-            await tts.run_to_queue(
-                transport.send_queue,
-                image_sync_aggregator.run(
-                    tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))
-                ),
-            )
+            await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])

-        transport.transcription_settings["extra"]["punctuate"] = True
-        await asyncio.gather(transport.run(), handle_transcriptions())
+        await transport.run(pipeline)


if __name__ == "__main__":
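The structural change in this file is the move from manually nested generator composition to the new `Pipeline` abstraction. A minimal before/after sketch, using only names that appear in the diff:

    # Before: services composed by nesting run() generators, innermost first
    await tts.run_to_queue(
        transport.send_queue,
        image_sync_aggregator.run(
            tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))),
    )

    # After: the same processing order as a flat list, driven by the transport
    pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])
    await transport.run(pipeline)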
@@ -5,6 +5,7 @@
 import random
 from typing import AsyncGenerator
 from PIL import Image
+from dailyai.pipeline.pipeline import Pipeline

 from dailyai.transports.daily_transport import DailyTransport
 from dailyai.services.open_ai_services import OpenAILLMService
@@ -133,6 +134,7 @@ async def main(room_url: str, token):
         transport._camera_enabled = True
         transport._camera_width = 720
         transport._camera_height = 1280
+        transport.transcription_settings["extra"]["punctuate"] = True

         llm = OpenAILLMService(
             api_key=os.getenv("OPENAI_API_KEY"),
@@ -145,45 +147,34 @@ async def main(room_url: str, token):
         )
         isa = ImageSyncAggregator()

+        messages = [
+            {
+                "role": "system",
+                "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
+            },
+        ]
+
+        tma_in = LLMUserContextAggregator(
+            messages, transport._my_participant_id)
+        tma_out = LLMAssistantContextAggregator(
+            messages, transport._my_participant_id
+        )
+        tf = TranscriptFilter(transport._my_participant_id)
+        ncf = NameCheckFilter(["Santa Cat", "Santa"])
+
+        pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])

         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
-            await tts.say(
+            await transport.say(
                 "Hi! If you want to talk to me, just say 'hey Santa Cat'.",
-                transport.send_queue,
+                tts,
             )

-        async def handle_transcriptions():
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
-                },
-            ]
-
-            tma_in = LLMUserContextAggregator(
-                messages, transport._my_participant_id)
-            tma_out = LLMAssistantContextAggregator(
-                messages, transport._my_participant_id
-            )
-            tf = TranscriptFilter(transport._my_participant_id)
-            ncf = NameCheckFilter(["Santa Cat", "Santa"])
-            await tts.run_to_queue(
-                transport.send_queue,
-                isa.run(
-                    tma_out.run(
-                        llm.run(
-                            tma_in.run(
-                                ncf.run(tf.run(transport.get_receive_frames())))
-                        )
-                    )
-                ),
-            )
-
         async def starting_image():
             await transport.send_queue.put(quiet_frame)

-        transport.transcription_settings["extra"]["punctuate"] = True
-        await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
+        await asyncio.gather(transport.run(pipeline), starting_image())


if __name__ == "__main__":
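Worth noting in the Santa Cat pipeline: the filters sit ahead of the context aggregator and LLM, so transcription frames only reach the model when the bot is addressed by name. A sketch of that ordering, where the comments state assumptions about each processor's role rather than documented behavior:

    pipeline = Pipeline([
        isa,      # ImageSyncAggregator: swaps speaking/waiting images around responses
        tf,       # TranscriptFilter: drops the bot's own transcriptions
        ncf,      # NameCheckFilter: passes frames only when "Santa" is mentioned
        tma_in,   # appends user text to the shared message context
        llm,      # generates the response
        tma_out,  # appends the assistant's response to the context
        tts,      # converts response text to audio frames
    ])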
@@ -3,6 +3,7 @@
 import logging
 import os
 import wave
+from dailyai.pipeline.pipeline import Pipeline

 from dailyai.transports.daily_transport import DailyTransport
 from dailyai.services.open_ai_services import OpenAILLMService
@@ -81,6 +82,7 @@ async def main(room_url: str, token):
         mic_sample_rate=16000,
         camera_enabled=False,
     )
+    transport.transcription_settings["extra"]["punctuate"] = True

     llm = OpenAILLMService(
         api_key=os.getenv("OPENAI_API_KEY"),
@@ -92,47 +94,31 @@ async def main(room_url: str, token):
         voice_id="ErXwobaYiN019PkySvjV",
     )

+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
+
+    tma_in = LLMUserContextAggregator(
+        messages, transport._my_participant_id)
+    tma_out = LLMAssistantContextAggregator(
+        messages, transport._my_participant_id
+    )
+    out_sound = OutboundSoundEffectWrapper()
+    in_sound = InboundSoundEffectWrapper()
+    fl = FrameLogger("LLM Out")
+    fl2 = FrameLogger("Transcription In")
+
+    pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])

     @transport.event_handler("on_first_other_participant_joined")
     async def on_first_other_participant_joined(transport):
-        await tts.say("Hi, I'm listening!", transport.send_queue)
+        await transport.say("Hi, I'm listening!", tts)
         await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

-    async def handle_transcriptions():
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
-            },
-        ]
-
-        tma_in = LLMUserContextAggregator(
-            messages, transport._my_participant_id)
-        tma_out = LLMAssistantContextAggregator(
-            messages, transport._my_participant_id
-        )
-        out_sound = OutboundSoundEffectWrapper()
-        in_sound = InboundSoundEffectWrapper()
-        fl = FrameLogger("LLM Out")
-        fl2 = FrameLogger("Transcription In")
-        await out_sound.run_to_queue(
-            transport.send_queue,
-            tts.run(
-                fl.run(
-                    tma_out.run(
-                        llm.run(
-                            fl2.run(
-                                in_sound.run(
-                                    tma_in.run(transport.get_receive_frames())
-                                )
-                            )
-                        )
-                    )
-                )
-            ),
-        )
-
-    transport.transcription_settings["extra"]["punctuate"] = True
-    await asyncio.gather(transport.run(), handle_transcriptions())
+    await asyncio.gather(transport.run(pipeline))


if __name__ == "__main__":
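One detail in this pipeline: the two `FrameLogger` instances bracket the LLM, which makes them a cheap debugging tool; each presumably logs frames passing through under its label. A sketch of the placement:

    fl2 = FrameLogger("Transcription In")  # inspect frames entering the LLM
    fl = FrameLogger("LLM Out")            # inspect frames leaving the LLM
    pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])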
src/dailyai/services/azure_ai_services.py (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
 except ModuleNotFoundError as e:
     print(f"Exception: {e}")
     print(
-        "In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `SPEECH_KEY` and `SPEECH_REGION` environment variables.")
+        "In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
     raise Exception(f"Missing module: {e}")

 from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
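Since the guard above only fires on a missing module, a caller could also fail fast on the renamed variables themselves. A small defensive sketch, not part of this PR's changes, assuming the Azure TTS service reads these variables at construction time:

    import os

    # Raise early with a pointer to the template instead of failing deep in the SDK.
    for var in ("AZURE_SPEECH_API_KEY", "AZURE_SPEECH_REGION"):
        if not os.getenv(var):
            raise RuntimeError(f"Set {var} before using Azure TTS (see dot-env.template).")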