diff --git a/dot-env.template b/dot-env.template
index bbbef51ab..06b8a36eb 100644
--- a/dot-env.template
+++ b/dot-env.template
@@ -2,8 +2,16 @@
 ANTHROPIC_API_KEY=...
 
 # Azure
-SPEECH_KEY=...
-SPEECH_REGION=...
+AZURE_SPEECH_REGION=...
+AZURE_SPEECH_API_KEY=...
+
+AZURE_CHATGPT_API_KEY=...
+AZURE_CHATGPT_ENDPOINT=https://...
+AZURE_CHATGPT_MODEL=...
+
+AZURE_DALLE_API_KEY=...
+AZURE_DALLE_ENDPOINT=https://...
+AZURE_DALLE_MODEL=...
 
 # Daily
 DAILY_API_KEY=...
diff --git a/examples/foundational/to_be_updated/05a-local-sync-speech-and-text.py b/examples/foundational/05a-local-sync-speech-and-text.py
similarity index 79%
rename from examples/foundational/to_be_updated/05a-local-sync-speech-and-text.py
rename to examples/foundational/05a-local-sync-speech-and-text.py
index 85c2dc204..e19988a80 100644
--- a/examples/foundational/to_be_updated/05a-local-sync-speech-and-text.py
+++ b/examples/foundational/05a-local-sync-speech-and-text.py
@@ -3,8 +3,9 @@
 import logging
 import tkinter as tk
 import os
+from dailyai.pipeline.aggregators import LLMFullResponseAggregator
 
-from dailyai.pipeline.frames import AudioFrame, ImageFrame
+from dailyai.pipeline.frames import AudioFrame, ImageFrame, LLMMessagesFrame, TextFrame
 from dailyai.services.open_ai_services import OpenAILLMService
 from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
 from dailyai.services.fal_ai_services import FalImageGenService
@@ -22,7 +23,7 @@ async def main():
     async with aiohttp.ClientSession() as session:
         meeting_duration_minutes = 5
         tk_root = tk.Tk()
-        tk_root.title("Calendar")
+        tk_root.title("dailyai")
 
         transport = LocalTransport(
             mic_enabled=True,
@@ -43,7 +44,7 @@
             api_key=os.getenv("OPENAI_API_KEY"),
             model="gpt-4-turbo-preview")
 
-        dalle = FalImageGenService(
+        imagegen = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
@@ -60,18 +61,33 @@ async def get_all_audio(text):
 
             return all_audio
 
+        async def get_month_description(aggregator, frame):
+            async for frame in aggregator.process_frame(frame):
+                if isinstance(frame, TextFrame):
+                    return frame.text
+
         async def get_month_data(month):
             messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
                 month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
 
-            image_description = await llm.run_llm(messages)
+            messages_frame = LLMMessagesFrame(messages)
+
+            llm_full_response_aggregator = LLMFullResponseAggregator()
+
+            image_description = None
+            async for frame in llm.process_frame(messages_frame):
+                result = await get_month_description(llm_full_response_aggregator, frame)
+                if result:
+                    image_description = result
+                    break
+
             if not image_description:
                 return
 
             to_speak = f"{month}: {image_description}"
             audio_task = asyncio.create_task(get_all_audio(to_speak))
             image_task = asyncio.create_task(
-                dalle.run_image_gen(image_description))
+                imagegen.run_image_gen(image_description))
             (audio, image_data) = await asyncio.gather(audio_task, image_task)
 
             return {
@@ -82,19 +98,14 @@ async def get_month_data(month):
                 "audio": audio,
             }
 
+        # We only specify 5 months as we create tasks all at once and we might
+        # get rate limited otherwise.
         months: list[str] = [
             "January",
             "February",
             "March",
             "April",
             "May",
-            "June",
-            "July",
-            "August",
-            "September",
-            "October",
-            "November",
-            "December",
         ]
 
         async def show_images():
diff --git a/examples/foundational/to_be_updated/06a-image-sync.py b/examples/foundational/06a-image-sync.py
similarity index 53%
rename from examples/foundational/to_be_updated/06a-image-sync.py
rename to examples/foundational/06a-image-sync.py
index 8a517b448..3b25076ba 100644
--- a/examples/foundational/to_be_updated/06a-image-sync.py
+++ b/examples/foundational/06a-image-sync.py
@@ -5,7 +5,8 @@
 import aiohttp
 
 from PIL import Image
-from dailyai.pipeline.frames import ImageFrame, Frame
+from dailyai.pipeline.frames import ImageFrame, Frame, TextFrame
+from dailyai.pipeline.pipeline import Pipeline
 from dailyai.transports.daily_transport import DailyTransport
 from dailyai.services.ai_services import AIService
 from dailyai.pipeline.aggregators import (
@@ -14,7 +15,6 @@
 )
 from dailyai.services.open_ai_services import OpenAILLMService
 from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
-from dailyai.services.fal_ai_services import FalImageGenService
 
 from runner import configure
 
@@ -53,6 +53,7 @@ async def main(room_url: str, token):
         transport._camera_height = 1024
         transport._mic_enabled = True
         transport._mic_sample_rate = 16000
+        transport.transcription_settings["extra"]["punctuate"] = True
 
         tts = ElevenLabsTTSService(
             aiohttp_session=session,
@@ -64,57 +65,30 @@
             api_key=os.getenv("OPENAI_API_KEY"),
             model="gpt-4-turbo-preview")
 
-        img = FalImageGenService(
-            image_size="1024x1024",
-            aiohttp_session=session,
-            key_id=os.getenv("FAL_KEY_ID"),
-            key_secret=os.getenv("FAL_KEY_SECRET"),
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not include any special characters. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserContextAggregator(
+            messages, transport._my_participant_id)
+        tma_out = LLMAssistantContextAggregator(
+            messages, transport._my_participant_id
+        )
+        image_sync_aggregator = ImageSyncAggregator(
+            os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
+            os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
         )
 
-        async def get_images():
-            get_speaking_task = asyncio.create_task(
-                img.run_image_gen("An image of a cat speaking")
-            )
-            get_waiting_task = asyncio.create_task(
-                img.run_image_gen("An image of a cat waiting")
-            )
-
-            (speaking_data, waiting_data) = await asyncio.gather(
-                get_speaking_task, get_waiting_task
-            )
-
-            return speaking_data, waiting_data
+        pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])
 
         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
-            await tts.say("Hi, I'm listening!", transport.send_queue)
-
-        async def handle_transcriptions():
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
-                },
-            ]
-
-            tma_in = LLMUserContextAggregator(
-                messages, transport._my_participant_id)
-            tma_out = LLMAssistantContextAggregator(
-                messages, transport._my_participant_id
-            )
-            image_sync_aggregator = ImageSyncAggregator(
-                os.path.join(
-                    os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(
-                    os.path.dirname(__file__), "assets", "waiting.png"), )
-            await tts.run_to_queue(
-                transport.send_queue,
-                image_sync_aggregator.run(
-                    tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))
-                ),
-            )
+            await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])
 
-        transport.transcription_settings["extra"]["punctuate"] = True
-        await asyncio.gather(transport.run(), handle_transcriptions())
+        await transport.run(pipeline)
 
 
 if __name__ == "__main__":
diff --git a/examples/foundational/to_be_updated/10-wake-word.py b/examples/foundational/10-wake-word.py
similarity index 79%
rename from examples/foundational/to_be_updated/10-wake-word.py
rename to examples/foundational/10-wake-word.py
index 914910774..e07d40950 100644
--- a/examples/foundational/to_be_updated/10-wake-word.py
+++ b/examples/foundational/10-wake-word.py
@@ -5,6 +5,7 @@
 import random
 from typing import AsyncGenerator
 from PIL import Image
+from dailyai.pipeline.pipeline import Pipeline
 
 from dailyai.transports.daily_transport import DailyTransport
 from dailyai.services.open_ai_services import OpenAILLMService
@@ -133,6 +134,7 @@ async def main(room_url: str, token):
         transport._camera_enabled = True
         transport._camera_width = 720
         transport._camera_height = 1280
+        transport.transcription_settings["extra"]["punctuate"] = True
 
         llm = OpenAILLMService(
             api_key=os.getenv("OPENAI_API_KEY"),
@@ -145,45 +147,34 @@
         )
         isa = ImageSyncAggregator()
 
+        messages = [
+            {
+                "role": "system",
+                "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
+            },
+        ]
+
+        tma_in = LLMUserContextAggregator(
+            messages, transport._my_participant_id)
+        tma_out = LLMAssistantContextAggregator(
+            messages, transport._my_participant_id
+        )
+        tf = TranscriptFilter(transport._my_participant_id)
+        ncf = NameCheckFilter(["Santa Cat", "Santa"])
+
+        pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])
+
         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
-            await tts.say(
+            await transport.say(
                 "Hi! If you want to talk to me, just say 'hey Santa Cat'.",
-                transport.send_queue,
-            )
-
-        async def handle_transcriptions():
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
-                },
-            ]
-
-            tma_in = LLMUserContextAggregator(
-                messages, transport._my_participant_id)
-            tma_out = LLMAssistantContextAggregator(
-                messages, transport._my_participant_id
-            )
-            tf = TranscriptFilter(transport._my_participant_id)
-            ncf = NameCheckFilter(["Santa Cat", "Santa"])
-            await tts.run_to_queue(
-                transport.send_queue,
-                isa.run(
-                    tma_out.run(
-                        llm.run(
-                            tma_in.run(
-                                ncf.run(tf.run(transport.get_receive_frames())))
-                        )
-                    )
-                ),
+                tts,
             )
 
         async def starting_image():
             await transport.send_queue.put(quiet_frame)
 
-        transport.transcription_settings["extra"]["punctuate"] = True
-        await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
+        await asyncio.gather(transport.run(pipeline), starting_image())
 
 
 if __name__ == "__main__":
diff --git a/examples/foundational/to_be_updated/11-sound-effects.py b/examples/foundational/11-sound-effects.py
similarity index 67%
rename from examples/foundational/to_be_updated/11-sound-effects.py
rename to examples/foundational/11-sound-effects.py
index 412e358ee..a6d36cd1c 100644
--- a/examples/foundational/to_be_updated/11-sound-effects.py
+++ b/examples/foundational/11-sound-effects.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import wave
+from dailyai.pipeline.pipeline import Pipeline
 
 from dailyai.transports.daily_transport import DailyTransport
 from dailyai.services.open_ai_services import OpenAILLMService
@@ -81,6 +82,7 @@ async def main(room_url: str, token):
             mic_sample_rate=16000,
             camera_enabled=False,
         )
+        transport.transcription_settings["extra"]["punctuate"] = True
 
         llm = OpenAILLMService(
             api_key=os.getenv("OPENAI_API_KEY"),
@@ -92,47 +94,31 @@
             voice_id="ErXwobaYiN019PkySvjV",
         )
 
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserContextAggregator(
+            messages, transport._my_participant_id)
+        tma_out = LLMAssistantContextAggregator(
+            messages, transport._my_participant_id
+        )
+        out_sound = OutboundSoundEffectWrapper()
+        in_sound = InboundSoundEffectWrapper()
+        fl = FrameLogger("LLM Out")
+        fl2 = FrameLogger("Transcription In")
+
+        pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])
+
         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
-            await tts.say("Hi, I'm listening!", transport.send_queue)
+            await transport.say("Hi, I'm listening!", tts)
             await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
 
-        async def handle_transcriptions():
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
-                },
-            ]
-
-            tma_in = LLMUserContextAggregator(
-                messages, transport._my_participant_id)
-            tma_out = LLMAssistantContextAggregator(
-                messages, transport._my_participant_id
-            )
-            out_sound = OutboundSoundEffectWrapper()
-            in_sound = InboundSoundEffectWrapper()
-            fl = FrameLogger("LLM Out")
-            fl2 = FrameLogger("Transcription In")
-            await out_sound.run_to_queue(
-                transport.send_queue,
-                tts.run(
-                    fl.run(
-                        tma_out.run(
-                            llm.run(
-                                fl2.run(
-                                    in_sound.run(
-                                        tma_in.run(transport.get_receive_frames())
-                                    )
-                                )
-                            )
-                        )
-                    )
-                ),
-            )
-
-        transport.transcription_settings["extra"]["punctuate"] = True
-        await asyncio.gather(transport.run(), handle_transcriptions())
+        await asyncio.gather(transport.run(pipeline))
 
 
 if __name__ == "__main__":
diff --git a/src/dailyai/services/azure_ai_services.py b/src/dailyai/services/azure_ai_services.py
index 0f0151144..e9b15ec30 100644
--- a/src/dailyai/services/azure_ai_services.py
+++ b/src/dailyai/services/azure_ai_services.py
@@ -19,7 +19,7 @@
 except ModuleNotFoundError as e:
     print(f"Exception: {e}")
     print(
-        "In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `SPEECH_KEY` and `SPEECH_REGION` environment variables.")
+        "In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
     raise Exception(f"Missing module: {e}")
 
 from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
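Note on the example rewrites above: each one replaces the per-example handle_transcriptions() task and its hand-chained *.run() calls with a Pipeline that the transport drives directly. A minimal sketch of that wiring, using only constructors and calls that appear in this diff (the transport and tts arguments stand in for the DailyTransport and TTS setup each example already performs, so this is illustrative rather than a complete runnable example):

import os

from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frames import TextFrame
from dailyai.pipeline.aggregators import (
    LLMAssistantContextAggregator,
    LLMUserContextAggregator,
)
from dailyai.services.open_ai_services import OpenAILLMService


async def run_bot(transport, tts):
    llm = OpenAILLMService(
        api_key=os.getenv("OPENAI_API_KEY"),
        model="gpt-4-turbo-preview")

    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call.",
        },
    ]

    # Context aggregators keep the running conversation for the LLM.
    tma_in = LLMUserContextAggregator(messages, transport._my_participant_id)
    tma_out = LLMAssistantContextAggregator(messages, transport._my_participant_id)

    # Frames flow through the processors in order: user transcription ->
    # user context -> LLM -> assistant context -> TTS.
    pipeline = Pipeline([tma_in, llm, tma_out, tts])

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        # Greet by queueing a frame into the pipeline rather than calling the
        # TTS service directly.
        await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])

    # The transport now drives the pipeline; no separate handle_transcriptions() task.
    await transport.run(pipeline)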