diff --git a/CHANGELOG.md b/CHANGELOG.md
index 319c92f12..84a6ae873 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Fixed Deepgram Aura TTS base_url and added ErrorFrame reporting.
 - GoogleLLMService `api_key` argument is now mandatory.
 
 ### Fixed
diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py
new file mode 100644
index 000000000..30703c07e
--- /dev/null
+++ b/examples/foundational/07c-interruptible-deepgram.py
@@ -0,0 +1,94 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import aiohttp
+import os
+import sys
+
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineTask
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantResponseAggregator, LLMUserResponseAggregator)
+from pipecat.services.deepgram import DeepgramTTSService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.vad.silero import SileroVADAnalyzer
+
+from runner import configure
+
+from loguru import logger
+
+from dotenv import load_dotenv
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_out_enabled=True,
+                transcription_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer()
+            )
+        )
+
+        tts = DeepgramTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("DEEPGRAM_API_KEY"),
+            voice="aura-helios-en"
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserResponseAggregator(messages)
+        tma_out = LLMAssistantResponseAggregator(messages)
+
+        pipeline = Pipeline([
+            transport.input(),   # Transport user input
+            tma_in,              # User responses
+            llm,                 # LLM
+            tts,                 # TTS
+            transport.output(),  # Transport bot output
+            tma_out              # Assistant spoken responses
+        ])
+
+        task = PipelineTask(pipeline, allow_interruptions=True)
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append(
+                {"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index 294073b87..74a0ceea3 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -8,7 +8,7 @@
 
 from typing import AsyncGenerator
 
-from pipecat.frames.frames import AudioRawFrame, Frame
+from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
 from pipecat.services.ai_services import TTSService
 
 from loguru import logger
@@ -21,7 +21,7 @@ def __init__(
             *,
             aiohttp_session: aiohttp.ClientSession,
             api_key: str,
-            voice: str = "alpha-asteria-en-v2",
+            voice: str = "aura-helios-en",
             **kwargs):
         super().__init__(**kwargs)
 
@@ -31,11 +31,18 @@ def __init__(
 
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.info(f"Running Deepgram TTS for {text}")
-        base_url = "https://api.beta.deepgram.com/v1/speak"
+        base_url = "https://api.deepgram.com/v1/speak"
         request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
         headers = {"authorization": f"token {self._api_key}"}
         body = {"text": text}
+
         async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
+            if r.status != 200:
+                text = await r.text()
+                logger.error(f"Error getting audio (status: {r.status}, error: {text})")
+                yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
+                return
+
             async for data in r.content:
                 frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
                 yield frame