diff --git a/src/dailyai/queue_aggregators.py b/src/dailyai/queue_aggregators.py
index 80c99bc19..182d7d479 100644
--- a/src/dailyai/queue_aggregators.py
+++ b/src/dailyai/queue_aggregators.py
@@ -61,13 +61,17 @@ async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
             # TODO: split up transcription by participant
             if self.complete_sentences:
-                self.sentence += frame.text  # type: ignore -- the linter thinks this isn't a TextQueueFrame, even though we check it above
+                # type: ignore -- the linter thinks this isn't a TextQueueFrame, even
+                # though we check it above
+                self.sentence += frame.text
                 if self.sentence.endswith((".", "?", "!")):
                     self.messages.append({"role": self.role, "content": self.sentence})
                     self.sentence = ""
                     yield LLMMessagesQueueFrame(self.messages)
             else:
-                self.messages.append({"role": self.role, "content": frame.text})  # type: ignore -- the linter thinks this isn't a TextQueueFrame, even though we check it above
+                # type: ignore -- the linter thinks this isn't a TextQueueFrame, even
+                # though we check it above
+                self.messages.append({"role": self.role, "content": frame.text})
                 yield LLMMessagesQueueFrame(self.messages)

     async def finalize(self) -> AsyncGenerator[QueueFrame, None]:
@@ -79,9 +83,9 @@ class LLMUserContextAggregator(LLMContextAggregator):
     def __init__(self,
-            messages: list[dict],
-            bot_participant_id=None,
-            complete_sentences=True):
+                 messages: list[dict],
+                 bot_participant_id=None,
+                 complete_sentences=True):
         super().__init__(messages, "user", bot_participant_id, complete_sentences, pass_through=False)
diff --git a/src/dailyai/queue_frame.py b/src/dailyai/queue_frame.py
index 3e249b38c..d43dbdf82 100644
--- a/src/dailyai/queue_frame.py
+++ b/src/dailyai/queue_frame.py
@@ -18,9 +18,11 @@ class StartStreamQueueFrame(ControlQueueFrame):
 class EndStreamQueueFrame(ControlQueueFrame):
     pass

+
 class LLMResponseEndQueueFrame(QueueFrame):
     pass

+
 @dataclass()
 class AudioQueueFrame(QueueFrame):
     data: bytes
diff --git a/src/dailyai/services/base_transport_service.py b/src/dailyai/services/base_transport_service.py
index 139edc6ff..b05e7e373 100644
--- a/src/dailyai/services/base_transport_service.py
+++ b/src/dailyai/services/base_transport_service.py
@@ -16,6 +16,7 @@
     StartStreamQueueFrame,
 )

+
 class BaseTransportService():

     def __init__(
diff --git a/src/dailyai/services/daily_transport_service.py b/src/dailyai/services/daily_transport_service.py
index 575a6560b..2f07e6d63 100644
--- a/src/dailyai/services/daily_transport_service.py
+++ b/src/dailyai/services/daily_transport_service.py
@@ -45,7 +45,7 @@ def __init__(
         start_transcription: bool = False,
         **kwargs,
     ):
-        super().__init__(**kwargs) # This will call BaseTransportService.__init__ method, not EventHandler
+        super().__init__(**kwargs)  # This will call BaseTransportService.__init__ method, not EventHandler

         self._room_url: str = room_url
         self._bot_name: str = bot_name
diff --git a/src/dailyai/services/fal_ai_services.py b/src/dailyai/services/fal_ai_services.py
index ca1f93d89..9464b46dd 100644
--- a/src/dailyai/services/fal_ai_services.py
+++ b/src/dailyai/services/fal_ai_services.py
@@ -13,7 +13,13 @@ class FalImageGenService(ImageGenService):
-    def __init__(self, *, image_size, aiohttp_session: aiohttp.ClientSession, key_id=None, key_secret=None):
+    def __init__(
+            self,
+            *,
+            image_size,
+            aiohttp_session: aiohttp.ClientSession,
+            key_id=None,
+            key_secret=None):
         super().__init__(image_size)
         self._aiohttp_session = aiohttp_session
         if key_id:
diff --git a/src/dailyai/services/local_transport_service.py b/src/dailyai/services/local_transport_service.py
index 91af6763c..7538d3b06 100644
--- a/src/dailyai/services/local_transport_service.py
+++ b/src/dailyai/services/local_transport_service.py
@@ -22,11 +22,15 @@ def __init__(self, **kwargs):

     async def _write_frame_to_tkinter(self, frame: bytes):
         data = f"P6 {self._camera_width} {self._camera_height} 255 ".encode() + frame
-        photo = tk.PhotoImage(width=self._camera_width, height=self._camera_height, data=data, format="PPM")
+        photo = tk.PhotoImage(
+            width=self._camera_width,
+            height=self._camera_height,
+            data=data,
+            format="PPM")
         self._image_label.config(image=photo)

         # This holds a reference to the photo, preventing it from being garbage collected.
-        self._image_label.image = photo # type: ignore
+        self._image_label.image = photo  # type: ignore

     def write_frame_to_camera(self, frame: bytes):
         if self._camera_enabled and self._loop:
diff --git a/src/examples/foundational/01-say-one-thing.py b/src/examples/foundational/01-say-one-thing.py
index 989da84af..37136facf 100644
--- a/src/examples/foundational/01-say-one-thing.py
+++ b/src/examples/foundational/01-say-one-thing.py
@@ -7,6 +7,7 @@
 from examples.foundational.support.runner import configure

+
 async def main(room_url):
     async with aiohttp.ClientSession() as session:
         # create a transport service object using environment variables for
@@ -25,7 +26,10 @@ async def main(room_url):
             meeting_duration_minutes,
             mic_enabled=True
         )
-        tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))

         # Register an event handler so we can play the audio when the participant joins.
         @transport.event_handler("on_participant_joined")
diff --git a/src/examples/foundational/02-llm-say-one-thing.py b/src/examples/foundational/02-llm-say-one-thing.py
index 7a30a906f..b15023380 100644
--- a/src/examples/foundational/02-llm-say-one-thing.py
+++ b/src/examples/foundational/02-llm-say-one-thing.py
@@ -11,6 +11,7 @@
 from dailyai.services.open_ai_services import OpenAILLMService
 from examples.foundational.support.runner import configure

+
 async def main(room_url):
     async with aiohttp.ClientSession() as session:
         meeting_duration_minutes = 1
@@ -22,12 +23,18 @@ async def main(room_url):
             mic_enabled=True
         )
-        tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
         # tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
         # tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))

-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        #llm = OpenAILLMService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        # llm = OpenAILLMService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))

         messages = [{
             "role": "system",
             "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world."
diff --git a/src/examples/foundational/03-still-frame.py b/src/examples/foundational/03-still-frame.py
index 06d40448b..de368d5d6 100644
--- a/src/examples/foundational/03-still-frame.py
+++ b/src/examples/foundational/03-still-frame.py
@@ -28,7 +28,11 @@ async def main(room_url):
             camera_height=1024
         )
-        imagegen = FalImageGenService(image_size="1024x1024", aiohttp_session=session, key_id=os.getenv("FAL_KEY_ID"), key_secret=os.getenv("FAL_KEY_SECRET"))
+        imagegen = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"))

         # imagegen = OpenAIImageGenService(aiohttp_session=session, api_key=os.getenv("OPENAI_DALLE_API_KEY"), image_size="1024x1024")
         # imagegen = AzureImageGenServiceREST(image_size="1024x1024", aiohttp_session=session, api_key=os.getenv("AZURE_DALLE_API_KEY"), endpoint=os.getenv("AZURE_DALLE_ENDPOINT"), model=os.getenv("AZURE_DALLE_MODEL"))
diff --git a/src/examples/foundational/04-utterance-and-speech.py b/src/examples/foundational/04-utterance-and-speech.py
index 18e7c2ff7..17bd32797 100644
--- a/src/examples/foundational/04-utterance-and-speech.py
+++ b/src/examples/foundational/04-utterance-and-speech.py
@@ -10,6 +10,7 @@
 from examples.foundational.support.runner import configure

+
 async def main(room_url: str):
     async with aiohttp.ClientSession() as session:
         transport = DailyTransportService(
@@ -22,9 +23,17 @@ async def main(room_url: str):
             camera_enabled=False
         )
-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        azure_tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
-        elevenlabs_tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        azure_tts = AzureTTSService(
+            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+            region=os.getenv("AZURE_SPEECH_REGION"))
+        elevenlabs_tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))

         messages = [{"role": "system", "content": "tell the user a joke about llamas"}]
diff --git a/src/examples/foundational/05-sync-speech-and-image.py b/src/examples/foundational/05-sync-speech-and-image.py
index 55e4ec9f7..dfef2bc15 100644
--- a/src/examples/foundational/05-sync-speech-and-image.py
+++ b/src/examples/foundational/05-sync-speech-and-image.py
@@ -11,6 +11,7 @@
 from examples.foundational.support.runner import configure

+
 async def main(room_url):
     async with aiohttp.ClientSession() as session:
         meeting_duration_minutes = 5
@@ -26,11 +27,21 @@ async def main(room_url):
             camera_height=1024
         )
-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id="ErXwobaYiN019PkySvjV")
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id="ErXwobaYiN019PkySvjV")
         # tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))

-        dalle = FalImageGenService(image_size="1024x1024", aiohttp_session=session, key_id=os.getenv("FAL_KEY_ID"), key_secret=os.getenv("FAL_KEY_SECRET"))
+        dalle = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"))
         # dalle = OpenAIImageGenService(aiohttp_session=session, api_key=os.getenv("OPENAI_DALLE_API_KEY"), image_size="1024x1024")
         # dalle = AzureImageGenServiceREST(image_size="1024x1024", aiohttp_session=session, api_key=os.getenv("AZURE_DALLE_API_KEY"), endpoint=os.getenv("AZURE_DALLE_ENDPOINT"), model=os.getenv("AZURE_DALLE_MODEL"))
diff --git a/src/examples/foundational/06-listen-and-respond.py b/src/examples/foundational/06-listen-and-respond.py
index df8895731..fa5e077cc 100644
--- a/src/examples/foundational/06-listen-and-respond.py
+++ b/src/examples/foundational/06-listen-and-respond.py
@@ -6,6 +6,7 @@
 from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMContextAggregator, LLMUserContextAggregator
 from examples.foundational.support.runner import configure

+
 async def main(room_url: str, token):
     transport = DailyTransportService(
         room_url,
@@ -15,11 +16,16 @@ async def main(room_url: str, token):
         start_transcription=True,
         mic_enabled=True,
         mic_sample_rate=16000,
-        camera_enabled = False
+        camera_enabled=False
     )

-    llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-    tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
+    llm = AzureLLMService(
+        api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+        endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+        model=os.getenv("AZURE_CHATGPT_MODEL"))
+    tts = AzureTTSService(
+        api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+        region=os.getenv("AZURE_SPEECH_REGION"))

     @transport.event_handler("on_first_other_participant_joined")
     async def on_first_other_participant_joined(transport):
diff --git a/src/examples/foundational/06a-image-sync.py b/src/examples/foundational/06a-image-sync.py
index fbef24364..032d828c1 100644
--- a/src/examples/foundational/06a-image-sync.py
+++ b/src/examples/foundational/06a-image-sync.py
@@ -18,6 +18,7 @@
 from examples.foundational.support.runner import configure

+
 class ImageSyncAggregator(AIService):
     def __init__(self, speaking_path: str, waiting_path: str):
         self._speaking_image = Image.open(speaking_path)
@@ -46,9 +47,18 @@ async def main(room_url: str, token):
         transport._mic_enabled = True
         transport._mic_sample_rate = 16000

-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
-        img = FalImageGenService(image_size="1024x1024", aiohttp_session=session, key_id=os.getenv("FAL_KEY_ID"), key_secret=os.getenv("FAL_KEY_SECRET"))
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        tts = AzureTTSService(
+            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+            region=os.getenv("AZURE_SPEECH_REGION"))
+        img = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"))

         async def get_images():
             get_speaking_task = asyncio.create_task(
diff --git a/src/examples/foundational/07-interruptible.py b/src/examples/foundational/07-interruptible.py
index cd7f9de99..6cae19f5c 100644
--- a/src/examples/foundational/07-interruptible.py
+++ b/src/examples/foundational/07-interruptible.py
@@ -10,6 +10,7 @@
 from examples.foundational.support.runner import configure

+
 async def main(room_url: str, token):
     async with aiohttp.ClientSession() as session:
         transport = DailyTransportService(
@@ -23,8 +24,13 @@ async def main(room_url: str, token):
             camera_enabled=False,
         )

-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        tts = AzureTTSService(
+            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+            region=os.getenv("AZURE_SPEECH_REGION"))

         async def run_response(user_speech, tma_in, tma_out):
             await tts.run_to_queue(
diff --git a/src/examples/foundational/08-bots-arguing.py b/src/examples/foundational/08-bots-arguing.py
index 95fc62e40..a5329086e 100644
--- a/src/examples/foundational/08-bots-arguing.py
+++ b/src/examples/foundational/08-bots-arguing.py
@@ -10,7 +10,8 @@
 from examples.foundational.support.runner import configure

-async def main(room_url:str):
+
+async def main(room_url: str):
     async with aiohttp.ClientSession() as session:
         transport = DailyTransportService(
             room_url,
@@ -24,16 +25,30 @@ async def main(room_url: str):
             camera_height=1024
         )

-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        tts1 = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
-        tts2 = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id="jBpfuIE2acCO8z3wKNLl")
-        dalle = FalImageGenService(image_size="1024x1024", aiohttp_session=session, key_id=os.getenv("FAL_KEY_ID"), key_secret=os.getenv("FAL_KEY_SECRET"))
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        tts1 = AzureTTSService(
+            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+            region=os.getenv("AZURE_SPEECH_REGION"))
+        tts2 = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id="jBpfuIE2acCO8z3wKNLl")
+        dalle = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"))

         bot1_messages = [
             {"role": "system", "content": "You are a stern librarian. You strongly believe that a hot dog is a sandwich. Start by stating this fact in a few sentences, then be prepared to debate this with the user. You shouldn't ever compromise on the fundamental truth that a hot dog is a sandwich. Your responses should only be a few sentences long."},
         ]
         bot2_messages = [
-            {"role": "system", "content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich."},
+            {
+                "role": "system",
+                "content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich."},
         ]

         async def get_bot1_statement():
diff --git a/src/examples/foundational/10-wake-word.py b/src/examples/foundational/10-wake-word.py
index ccf706d50..54db2468c 100644
--- a/src/examples/foundational/10-wake-word.py
+++ b/src/examples/foundational/10-wake-word.py
@@ -71,7 +71,7 @@ async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:

 class NameCheckFilter(AIService):
-    def __init__(self, names:list[str]):
+    def __init__(self, names: list[str]):
         self.names = names
         self.sentence = ""
@@ -123,8 +123,14 @@ async def main(room_url: str, token):
         transport._camera_width = 720
         transport._camera_height = 1280

-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id="jBpfuIE2acCO8z3wKNLl")
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id="jBpfuIE2acCO8z3wKNLl")
         isa = ImageSyncAggregator()

         @transport.event_handler("on_first_other_participant_joined")
diff --git a/src/examples/foundational/11-sound-effects.py b/src/examples/foundational/11-sound-effects.py
index 743120f23..954f363d7 100644
--- a/src/examples/foundational/11-sound-effects.py
+++ b/src/examples/foundational/11-sound-effects.py
@@ -14,7 +14,7 @@
 from examples.foundational.support.runner import configure

-logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") # or whatever
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")  # or whatever
 logger = logging.getLogger("dailyai")
 logger.setLevel(logging.DEBUG)
@@ -36,8 +36,6 @@
     sounds[file] = audio_file.readframes(-1)


-
-
 class OutboundSoundEffectWrapper(AIService):
     def __init__(self):
         pass
@@ -50,6 +48,7 @@ async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
         else:
             yield frame

+
 class InboundSoundEffectWrapper(AIService):
     def __init__(self):
         pass
@@ -75,14 +74,20 @@ async def main(room_url: str, token):
             camera_enabled=False
         )

-        llm = AzureLLMService(api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), model=os.getenv("AZURE_CHATGPT_MODEL"))
-        tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id="ErXwobaYiN019PkySvjV")
-
+        llm = AzureLLMService(
+            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+            model=os.getenv("AZURE_CHATGPT_MODEL"))
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id="ErXwobaYiN019PkySvjV")

         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
             await tts.say("Hi, I'm listening!", transport.send_queue)
             await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
+
         async def handle_transcriptions():
             messages = [
                 {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
@@ -117,7 +122,6 @@ async def handle_transcriptions():
                 )
             )

-
         transport.transcription_settings["extra"]["punctuate"] = True
         await asyncio.gather(transport.run(), handle_transcriptions())
diff --git a/src/examples/foundational/13-whisper-transcription.py b/src/examples/foundational/13-whisper-transcription.py
index 147b20e22..726616c5f 100644
--- a/src/examples/foundational/13-whisper-transcription.py
+++ b/src/examples/foundational/13-whisper-transcription.py
@@ -5,6 +5,7 @@
 from examples.foundational.support.runner import configure

+
 async def main(room_url: str):
     transport = DailyTransportService(
         room_url,
diff --git a/src/examples/foundational/13a-whisper-local.py b/src/examples/foundational/13a-whisper-local.py
index 0a30ad721..6c764c1a9 100644
--- a/src/examples/foundational/13a-whisper-local.py
+++ b/src/examples/foundational/13a-whisper-local.py
@@ -17,7 +17,7 @@ async def main(room_url: str):
         camera_enabled=False,
         speaker_enabled=True,
         duration_minutes=meeting_duration_minutes,
-        start_transcription = True
+        start_transcription=True
     )
     stt = WhisperSTTService()
     transcription_output_queue = asyncio.Queue()
diff --git a/src/examples/foundational/support/runner.py b/src/examples/foundational/support/runner.py
index 94263c0a6..b4b7d4862 100644
--- a/src/examples/foundational/support/runner.py
+++ b/src/examples/foundational/support/runner.py
@@ -7,6 +7,7 @@
 from dotenv import load_dotenv
 load_dotenv()

+
 def configure():
     parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
     parser.add_argument(
@@ -26,11 +27,11 @@ def configure():
     key = args.apikey or os.getenv("DAILY_API_KEY")

     if not url:
-        raise Exception("No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")
-    
+        raise Exception(
+            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")
+
     if not key:
         raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")
-
     # Create a meeting token for the given room with an expiration 1 hour in the future.
     room_name: str = urllib.parse.urlparse(url).path[1:]
@@ -49,4 +50,4 @@ def configure():

     token: str = res.json()["token"]

-    return (url, token)
\ No newline at end of file
+    return (url, token)
diff --git a/src/examples/internal/11a-dial-out.py b/src/examples/internal/11a-dial-out.py
index 9d38299f4..b16c72ed1 100644
--- a/src/examples/internal/11a-dial-out.py
+++ b/src/examples/internal/11a-dial-out.py
@@ -30,8 +30,6 @@
     sounds[file] = audio_file.readframes(-1)


-
-
 class OutboundSoundEffectWrapper(AIService):
     def __init__(self):
         pass
@@ -44,6 +42,7 @@ async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
         else:
             yield frame

+
 class InboundSoundEffectWrapper(AIService):
     def __init__(self):
         pass
@@ -81,6 +80,7 @@ async def main(room_url: str, token, phone):
         async def on_first_other_participant_joined(transport):
             await tts.say("Hi, I'm listening!", transport.send_queue)
             await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
+
         async def handle_transcriptions():
             messages = [
                 {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
@@ -124,7 +124,6 @@ async def on_call_state_updated(transport, state):
             transport.start_recording()
             transport.dialout(phone)

-
         transport.transcription_settings["extra"]["punctuate"] = True
         await asyncio.gather(transport.run(), handle_transcriptions())