Added sound effect example (#18)

* added sound effect example * added dialout to this branch too * fixup * fixup for more dialout testing * cleanup
pipecat-ai · Feb 1, 2024 · 0d96f91 · 0d96f91
1 parent 4e95865
commit 0d96f91
Show file tree

Hide file tree

Showing 20 changed files with 354 additions and 6 deletions.
diff --git a/src/dailyai/queue_frame.py b/src/dailyai/queue_frame.py
@@ -18,6 +18,8 @@ class StartStreamQueueFrame(ControlQueueFrame):
 class EndStreamQueueFrame(ControlQueueFrame):
     pass
 
+class LLMResponseEndQueueFrame(QueueFrame):
+    """Payload-free marker frame signalling that an LLM response has ended."""
 
 @dataclass()
 class AudioQueueFrame(QueueFrame):

diff --git a/src/dailyai/services/ai_services.py b/src/dailyai/services/ai_services.py
@@ -9,6 +9,7 @@
     EndStreamQueueFrame,
     ImageQueueFrame,
     LLMMessagesQueueFrame,
+    LLMResponseEndQueueFrame,
     QueueFrame,
     TextQueueFrame,
 )
@@ -89,6 +90,9 @@ async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, N
         if isinstance(frame, LLMMessagesQueueFrame):
             async for text_chunk in self.run_llm_async(frame.messages):
                 yield TextQueueFrame(text_chunk)
+            yield LLMResponseEndQueueFrame()
+        else:
+            yield frame
 
 
 class TTSService(AIService):
@@ -186,6 +190,18 @@ async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, N
         text = await self.run_stt(content)
         yield TextQueueFrame(text)
 
+class FrameLogger(AIService):
+    """Pass-through service that logs each frame it receives, then yields it unchanged.
+
+    Place it between two pipeline stages to see which frames flow from one to the other.
+    """
+
+    def __init__(self, prefix="Frame", **kwargs):
+        """Remember the log-line label and forward the remaining kwargs to ``AIService``.
+
+        :param prefix: Text written before every logged frame.
+        :param kwargs: Passed through unchanged to ``AIService.__init__``.
+        """
+        super().__init__(**kwargs)
+        self.prefix = prefix
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        """Log ``frame`` and yield it unchanged.
+
+        Audio and image frames are logged by type only; any other frame is
+        logged using its string representation.
+        """
+        if isinstance(frame, (AudioQueueFrame, ImageQueueFrame)):
+            self.logger.info(f"{self.prefix}: {type(frame)}")
+        else:
+            # Use the service logger here too, so all output obeys the logging configuration
+            self.logger.info(f"{self.prefix}: {frame}")
+
+        yield frame
 
 @dataclass
 class AIServiceConfig:

diff --git a/src/dailyai/services/daily_transport_service.py b/src/dailyai/services/daily_transport_service.py
@@ -305,6 +305,12 @@ def call_joined(self, join_data, client_error):
             t = Thread(target=self._receive_audio, daemon=True)
             t.start()
 
+    def dialout(self, number):
+        """Ask the call client to start a dial-out.
+
+        :param number: Value sent to the client as the ``phoneNumber`` setting.
+        """
+        dialout_settings = {"phoneNumber": number}
+        self.client.start_dialout(dialout_settings)
+
+    def start_recording(self):
+        """Ask the call client to start recording; no settings are passed along."""
+        self.client.start_recording()
+
     def on_error(self, error):
         self._logger.error(f"on_error: {error}")
 

diff --git a/src/samples/foundational/06a-image-sync.py b/src/samples/foundational/06a-image-sync.py
@@ -79,8 +79,8 @@ async def handle_transcriptions():
                 messages, transport.my_participant_id
             )
             image_sync_aggregator = ImageSyncAggregator(
-                os.path.join(os.path.dirname(__file__), "images", "speaking.png"),
-                os.path.join(os.path.dirname(__file__), "images", "waiting.png"),
+                os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
+                os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
             )
             await tts.run_to_queue(
                 transport.send_queue,

diff --git a/src/samples/foundational/08b-debate-generator.py b/src/samples/foundational/08b-debate-generator.py
@@ -36,9 +36,9 @@ async def main(room_url:str):
         affirmative = "A woman dressed as a cowboy, outside on a ranch"
         negative = "Pikachu in a business suit"
 
-        topic = "Is a hot dog a sandwich?"
-        affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books"
-        negative = "A cat dressed in a hot dog costume"
+        # topic = "Is a hot dog a sandwich?"
+        # affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books"
+        # negative = "A cat dressed in a hot dog costume"
 
 
 

diff --git a/src/samples/foundational/10-wake-word.py b/src/samples/foundational/10-wake-word.py
@@ -39,7 +39,7 @@
 
 for file in image_files:
     # Build the full path to the image file
-    full_path = os.path.join(script_dir, "images", file)
+    full_path = os.path.join(script_dir, "assets", file)
     # Get the filename without the extension to use as the dictionary key
     filename = os.path.splitext(os.path.basename(full_path))[0]
     # Open the image and convert it to bytes

diff --git a/src/samples/foundational/11-sound-effects.py b/src/samples/foundational/11-sound-effects.py
@@ -0,0 +1,159 @@
+import argparse
+import asyncio
+import logging
+import os
+import wave
+import requests
+import time
+import urllib.parse
+
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
+from dailyai.queue_aggregators import LLMContextAggregator, LLMUserContextAggregator, LLMAssistantContextAggregator
+from dailyai.services.ai_services import AIService, FrameLogger
+from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
+from typing import AsyncGenerator
+
+# Set the log line format, then enable debug-level output for the "dailyai" logger.
+logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+# Audio frames of every sound effect, keyed by file name (e.g. "ding1.wav").
+sounds = {}
+sound_files = [
+    'ding1.wav',
+    'ding2.wav'
+]
+
+script_dir = os.path.dirname(__file__)
+
+for file in sound_files:
+    # Build the full path to the sound file in the "assets" directory next to this script
+    full_path = os.path.join(script_dir, "assets", file)
+    # Open the WAV file and read all of its audio frames as bytes
+    with wave.open(full_path) as audio_file:
+        sounds[file] = audio_file.readframes(audio_file.getnframes())
+
+
+
+
+class OutboundSoundEffectWrapper(AIService):
+    """Insert the "ding1.wav" sound effect ahead of every LLM response end marker.
+
+    All incoming frames are passed through unchanged; an
+    ``LLMResponseEndQueueFrame`` is preceded by an ``AudioQueueFrame`` carrying
+    the sound effect.
+    """
+
+    def __init__(self, **kwargs):
+        """Forward any keyword arguments to ``AIService``."""
+        # Run the base initializer, as other AIService subclasses such as FrameLogger do
+        super().__init__(**kwargs)
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        """Yield the sound effect before a response-end frame, then the frame itself."""
+        if isinstance(frame, LLMResponseEndQueueFrame):
+            yield AudioQueueFrame(sounds["ding1.wav"])
+
+        # Always forward the original frame, in case anything else up the stack needs it
+        yield frame
+
+class InboundSoundEffectWrapper(AIService):
+    """Insert the "ding2.wav" sound effect ahead of every LLM messages frame.
+
+    All incoming frames are passed through unchanged; an
+    ``LLMMessagesQueueFrame`` is preceded by an ``AudioQueueFrame`` carrying
+    the sound effect.
+    """
+
+    def __init__(self, **kwargs):
+        """Forward any keyword arguments to ``AIService``."""
+        # Run the base initializer, as other AIService subclasses such as FrameLogger do
+        super().__init__(**kwargs)
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        """Yield the sound effect before a messages frame, then the frame itself."""
+        if isinstance(frame, LLMMessagesQueueFrame):
+            yield AudioQueueFrame(sounds["ding2.wav"])
+
+        # Always forward the original frame, in case anything else up the stack needs it
+        yield frame
+
+
+async def main(room_url: str, token):
+    """Join a Daily room and run the sound-effect bot until the transport stops.
+
+    :param room_url: URL of the Daily room, passed to ``DailyTransportService``.
+    :param token: Meeting token, passed to ``DailyTransportService``.
+    """
+    global transport, llm, tts
+
+    transport = DailyTransportService(room_url, token, "Respond bot", 5)
+    transport.mic_enabled = True
+    transport.mic_sample_rate = 16000
+    transport.camera_enabled = False
+
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+
+    @transport.event_handler("on_first_other_participant_joined")
+    async def on_first_other_participant_joined(transport):
+        # Speak a greeting, then queue the ding sound behind it
+        await tts.say("Hi, I'm listening!", transport.send_queue)
+        await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
+
+    async def handle_transcriptions():
+        # Conversation history shared by the user and assistant context aggregators
+        messages = [
+            {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
+        ]
+
+        user_context = LLMUserContextAggregator(messages, transport.my_participant_id)
+        assistant_context = LLMAssistantContextAggregator(messages, transport.my_participant_id)
+        outbound_sound = OutboundSoundEffectWrapper()
+        inbound_sound = InboundSoundEffectWrapper()
+        llm_out_logger = FrameLogger("LLM Out")
+        transcription_logger = FrameLogger("Transcription In")
+
+        # Chain the stages one by one; each stage is handed the output of the previous one
+        frames = transport.get_receive_frames()
+        frames = user_context.run(frames)
+        frames = inbound_sound.run(frames)
+        frames = transcription_logger.run(frames)
+        frames = llm.run(frames)
+        frames = assistant_context.run(frames)
+        frames = llm_out_logger.run(frames)
+        frames = tts.run(frames)
+        await outbound_sound.run_to_queue(transport.send_queue, frames)
+
+    transport.transcription_settings["extra"]["punctuate"] = True
+    await asyncio.gather(transport.run(), handle_transcriptions())
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse arguments, create a meeting token, run the bot.
+    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
+    parser.add_argument(
+        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
+    )
+    parser.add_argument(
+        "-k",
+        "--apikey",
+        type=str,
+        required=True,
+        help="Daily API Key (needed to create token)",
+    )
+
+    # parse_known_args tolerates extra arguments; they are collected in `unknown` and ignored
+    args, unknown = parser.parse_known_args()
+
+    # Create a meeting token for the given room with an expiration 1 hour in the future.
+    room_name: str = urllib.parse.urlparse(args.url).path[1:]
+    expiration: float = time.time() + 60 * 60
+
+    res: requests.Response = requests.post(
+        "https://api.daily.co/v1/meeting-tokens",
+        headers={"Authorization": f"Bearer {args.apikey}"},
+        json={
+            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
+        },
+        # Without a timeout, requests waits indefinitely when the API does not answer
+        timeout=30,
+    )
+
+    if res.status_code != 200:
+        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
+
+    token: str = res.json()["token"]
+
+    asyncio.run(main(args.url, token))