diff --git a/src/dailyai/services/azure_ai_services.py b/src/dailyai/services/azure_ai_services.py index 9ffa52a46..a401e2e79 100644 --- a/src/dailyai/services/azure_ai_services.py +++ b/src/dailyai/services/azure_ai_services.py @@ -25,20 +25,21 @@ class AzureTTSService(TTSService): - def __init__(self, *, api_key, region): + def __init__(self, *, api_key, region, voice="en-US-SaraNeural"): super().__init__() self.speech_config = SpeechConfig(subscription=api_key, region=region) self.speech_synthesizer = SpeechSynthesizer( speech_config=self.speech_config, audio_config=None ) + self._voice = voice async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]: self.logger.info("Running azure tts") ssml = ( "" - "" + f"" "" "" "" diff --git a/src/dailyai/services/elevenlabs_ai_service.py b/src/dailyai/services/elevenlabs_ai_service.py index e1795aab3..07068b9dd 100644 --- a/src/dailyai/services/elevenlabs_ai_service.py +++ b/src/dailyai/services/elevenlabs_ai_service.py @@ -16,16 +16,18 @@ def __init__( aiohttp_session: aiohttp.ClientSession, api_key, voice_id, + model="eleven_turbo_v2", ): super().__init__() self._api_key = api_key self._voice_id = voice_id self._aiohttp_session = aiohttp_session + self._model = model async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]: url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream" - payload = {"text": sentence, "model_id": "eleven_turbo_v2"} + payload = {"text": sentence, "model_id": self._model} querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2} headers = { "xi-api-key": self._api_key, diff --git a/src/examples/starter-apps/translator.py b/src/examples/starter-apps/translator.py new file mode 100644 index 000000000..0e1ae5af3 --- /dev/null +++ b/src/examples/starter-apps/translator.py @@ -0,0 +1,84 @@ +import asyncio +import aiohttp +import logging +import os +from PIL import Image +from typing import AsyncGenerator + +from dailyai.pipeline.aggregators import ( + LLMResponseAggregator, + UserResponseAggregator, + SentenceAggregator, +) +from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, TextFrame +from dailyai.pipeline.frame_processor import FrameProcessor +from dailyai.services.ai_services import AIService, FrameLogger +from dailyai.pipeline.pipeline import Pipeline +from dailyai.services.daily_transport_service import DailyTransportService +from dailyai.services.azure_ai_services import AzureTTSService +from dailyai.services.open_ai_services import OpenAILLMService +from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService +from examples.support.runner import configure + +logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") +logger = logging.getLogger("dailyai") +logger.setLevel(logging.DEBUG) + +""" +This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating. +It also isn't saving what the user or bot says into the context object for use in subsequent interactions. +""" + + +# We need to use a custom service here to yield LLM frames without saving any context +class TranslationProcessor(FrameProcessor): + def __init__(self, language): + self._language = language + + async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: + if isinstance(frame, TextFrame): + context = [ + { + "role": "system", + "content": f"You will be provided with a sentence in English, and your task is to translate it into {self._language}.", + }, + {"role": "user", "content": frame.text}, + ] + yield LLMMessagesQueueFrame(context) + else: + yield frame + + +async def main(room_url: str, token): + async with aiohttp.ClientSession() as session: + transport = DailyTransportService( + room_url, + token, + "Translator", + duration_minutes=5, + start_transcription=True, + mic_enabled=True, + mic_sample_rate=16000, + camera_enabled=False, + vad_enabled=True, + ) + tts = AzureTTSService( + api_key=os.getenv("AZURE_SPEECH_API_KEY"), + region=os.getenv("AZURE_SPEECH_REGION"), + voice="es-ES-AlvaroNeural", + ) + llm = OpenAILLMService( + api_key=os.getenv("OPENAI_CHATGPT_API_KEY"), model="gpt-4-turbo-preview" + ) + sa = SentenceAggregator() + tp = TranslationProcessor("Spanish") + pipeline = Pipeline([sa, tp, llm, tts]) + + transport.transcription_settings["extra"]["endpointing"] = True + transport.transcription_settings["extra"]["punctuate"] = True + await transport.run(pipeline) + + +if __name__ == "__main__": + (url, token) = configure() + asyncio.run(main(url, token))