diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py
new file mode 100644
index 000000000..d9e5128d5
--- /dev/null
+++ b/examples/foundational/07d-interruptible-cartesia.py
@@ -0,0 +1,93 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import aiohttp
+import os
+import sys
+
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantResponseAggregator, LLMUserResponseAggregator)
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.vad.silero import SileroVADAnalyzer
+
+from runner import configure
+
+from loguru import logger
+
+from dotenv import load_dotenv
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_out_enabled=True,
+                transcription_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer()
+            )
+        )
+
+        tts = CartesiaTTSService(
+            api_key=os.getenv("CARTESIA_API_KEY"),
+            voice_name="Barbershop Man"
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_API_KEY"),
+            model="gpt-4o")
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserResponseAggregator(messages)
+        tma_out = LLMAssistantResponseAggregator(messages)
+
+        pipeline = Pipeline([
+            transport.input(),   # Transport user input
+            tma_in,              # User responses
+            llm,                 # LLM
+            tts,                 # TTS
+            transport.output(),  # Transport bot output
+            tma_out              # Assistant spoken responses
+        ])
+
+        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append(
+                {"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt
index 0aaaf7a01..35ddcd8b6 100644
--- a/macos-py3.10-requirements.txt
+++ b/macos-py3.10-requirements.txt
@@ -5,7 +5,9 @@
 #    pip-compile --all-extras pyproject.toml
 #
 aiohttp==3.9.5
-    # via pipecat-ai (pyproject.toml)
+    # via
+    #   cartesia
+    #   pipecat-ai (pyproject.toml)
 aiosignal==1.3.1
     # via aiohttp
 annotated-types==0.7.0
@@ -21,7 +23,7 @@ async-timeout==4.0.3
     # via aiohttp
 attrs==23.2.0
     # via aiohttp
-av==12.0.0
+av==12.1.0
     # via faster-whisper
 azure-cognitiveservices-speech==1.37.0
     # via pipecat-ai (pyproject.toml)
@@ -29,11 +31,15 @@ blinker==1.8.2
     # via flask
 cachetools==5.3.3
     # via google-auth
+cartesia==0.1.0
+    # via pipecat-ai (pyproject.toml)
 certifi==2024.2.2
     # via
     #   httpcore
     #   httpx
     #   requests
+cffi==1.16.0
+    # via sounddevice
 charset-normalizer==3.3.2
     # via requests
 click==8.1.7
@@ -42,7 +48,7 @@ coloredlogs==15.0.1
     # via onnxruntime
 ctranslate2==4.2.1
     # via faster-whisper
-daily-python==0.9.0
+daily-python==0.9.1
     # via pipecat-ai (pyproject.toml)
 distro==1.9.0
     # via
@@ -51,7 +57,9 @@ distro==1.9.0
 einops==0.8.0
     # via pipecat-ai (pyproject.toml)
 exceptiongroup==1.2.1
-    # via anyio
+    # via
+    #   anyio
+    #   pytest
 fal-client==0.4.0
     # via pipecat-ai (pyproject.toml)
 faster-whisper==1.0.2
@@ -122,6 +130,7 @@ httplib2==0.22.0
 httpx==0.27.0
     # via
     #   anthropic
+    #   cartesia
     #   fal-client
     #   openai
 httpx-sse==0.4.0
@@ -140,6 +149,8 @@ idna==3.7
     #   httpx
     #   requests
     #   yarl
+iniconfig==2.0.0
+    # via pytest
 itsdangerous==2.2.0
     # via flask
 jinja2==3.1.4
@@ -177,11 +188,14 @@ packaging==24.0
     # via
     #   huggingface-hub
     #   onnxruntime
+    #   pytest
     #   transformers
 pillow==10.3.0
     # via
     #   pipecat-ai (pyproject.toml)
     #   torchvision
+pluggy==1.5.0
+    # via pytest
 proto-plus==1.23.0
     # via
     #   google-ai-generativelanguage
@@ -204,6 +218,8 @@ pyasn1-modules==0.4.0
     # via google-auth
 pyaudio==0.2.14
     # via pipecat-ai (pyproject.toml)
+pycparser==2.22
+    # via cffi
 pydantic==2.7.2
     # via
     #   anthropic
@@ -217,6 +233,10 @@ pyloudnorm==0.1.1
     # via pipecat-ai (pyproject.toml)
 pyparsing==3.1.2
     # via httplib2
+pytest==8.2.1
+    # via pytest-asyncio
+pytest-asyncio==0.23.7
+    # via cartesia
 python-dotenv==1.0.1
     # via pipecat-ai (pyproject.toml)
 pyyaml==6.0.1
@@ -227,8 +247,9 @@ pyyaml==6.0.1
     #   transformers
 regex==2024.5.15
     # via transformers
-requests==2.32.2
+requests==2.32.3
     # via
+    #   cartesia
     #   google-api-core
     #   huggingface-hub
     #   pyht
@@ -247,7 +268,9 @@ sniffio==1.3.1
     #   anyio
     #   httpx
     #   openai
-sympy==1.12
+sounddevice==0.4.7
+    # via pipecat-ai (pyproject.toml)
+sympy==1.12.1
     # via
     #   onnxruntime
     #   torch
@@ -258,6 +281,8 @@ tokenizers==0.19.1
     #   anthropic
     #   faster-whisper
     #   transformers
+tomli==2.0.1
+    # via pytest
 torch==2.3.0
     # via
     #   pipecat-ai (pyproject.toml)
@@ -292,7 +317,9 @@ uritemplate==4.1.1
 urllib3==2.2.1
     # via requests
 websockets==12.0
-    # via pipecat-ai (pyproject.toml)
+    # via
+    #   cartesia
+    #   pipecat-ai (pyproject.toml)
 werkzeug==3.0.3
     # via flask
 yarl==1.9.4
diff --git a/pyproject.toml b/pyproject.toml
index 23245cfdc..f52db355a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.25.7" ]
 azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
+cartesia = [ "numpy~=1.26.0", "sounddevice", "cartesia" ]
= [ "numpy~=1.26.0", "sounddevice", "cartesia" ] daily = [ "daily-python~=0.9.0" ] examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ] fal = [ "fal-client~=0.4.0" ] diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py new file mode 100644 index 000000000..68c63f11a --- /dev/null +++ b/src/pipecat/services/cartesia.py @@ -0,0 +1,56 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from cartesia.tts import AsyncCartesiaTTS + +import time +from typing import AsyncGenerator + +from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame +from pipecat.services.ai_services import TTSService + +from loguru import logger + + +class CartesiaTTSService(TTSService): + + def __init__( + self, + *, + api_key: str, + voice_name: str, + **kwargs): + super().__init__(**kwargs) + + self._api_key = api_key + self._voice_name = voice_name + + self._client = None + + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + logger.debug(f"Transcribing text: [{text}]") + + try: + if self._client is None: + self._client = AsyncCartesiaTTS(api_key=self._api_key) + voices = self._client.get_voices() + self._voice_id = voices[self._voice_name]["id"] + self._voice = self._client.get_voice_embedding(voice_id=self._voice_id) + + chunk_generator = await self._client.generate( + transcript=text, voice=self._voice, stream=True, + model_id="upbeat-moon", data_rtype='array', output_format='pcm_16000', + # a chunk_time of 0.1 seems to be the default. there are small audio pops/gaps which + # we need to debug + chunk_time=0.1 + ) + + async for chunk in chunk_generator: + # print(f"") + frame = AudioRawFrame(chunk['audio'], 16000, 1) + yield frame + except Exception as e: + logger.error(f"Exception {e}") diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 74a0ceea3..fb4eb4d33 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -32,17 +32,21 @@ def __init__( async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.info(f"Running Deepgram TTS for {text}") base_url = "https://api.deepgram.com/v1/speak" - request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000" + request_url = f"{base_url}?model = { + self._voice} & encoding = linear16 & container = none & sample_rate = 16000" headers = {"authorization": f"token {self._api_key}"} body = {"text": text} - async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r: - if r.status != 200: - text = await r.text() - logger.error(f"Error getting audio (status: {r.status}, error: {text})") - yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})") - return - - async for data in r.content: - frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1) - yield frame + try: + async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r: + if r.status != 200: + text = await r.text() + logger.error(f"Error getting audio (status: {r.status}, error: {text})") + yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})") + return + + async for data in r.content: + frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1) + yield frame + except Exception as e: + logger.error(f"Exception {e}") diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 42d5b3eae..6fb937e9f 100644 --- a/src/pipecat/services/elevenlabs.py +++ 
@@ -8,7 +8,7 @@
 
 from typing import AsyncGenerator
 
-from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame, TTSStartedFrame, TTSStoppedFrame, TextFrame
+from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
 from pipecat.services.ai_services import TTSService
 
 from loguru import logger
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index f42a9b069..81d5e756e 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -86,9 +86,18 @@ async def _process_context(self, context: OpenAILLMContext):
             logger.debug(f"Google LLM TTFB: {time.time() - start_time}")
 
             async for chunk in self._async_generator_wrapper(response):
-                await self.push_frame(LLMResponseStartFrame())
-                await self.push_frame(TextFrame(chunk.text))
-                await self.push_frame(LLMResponseEndFrame())
+                try:
+                    text = chunk.text
+                    await self.push_frame(LLMResponseStartFrame())
+                    await self.push_frame(TextFrame(text))
+                    await self.push_frame(LLMResponseEndFrame())
+                except Exception as e:
+                    # Google LLMs seem to flag safety issues a lot; finish_reason 3 means SAFETY.
+                    if chunk.candidates[0].finish_reason == 3:
+                        logger.debug(
+                            f"LLM refused to generate content for safety reasons - {messages}.")
+                    else:
+                        logger.error(f"Error: {e}")
 
         except Exception as e:
             logger.error(f"Exception: {e}")
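
A quick way to exercise the new CartesiaTTSService outside of a full Daily pipeline is a standalone smoke test. This is a minimal sketch, not part of the patch: it assumes CARTESIA_API_KEY is set, that the "Barbershop Man" voice exists, and that each AudioRawFrame carries 16 kHz mono 16-bit PCM. Because the service requests data_rtype='array', frame.audio may be an array rather than bytes, hence the bytes() conversion.

    import asyncio
    import os
    import wave

    from pipecat.services.cartesia import CartesiaTTSService


    async def main():
        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_name="Barbershop Man")

        # run_tts() is an async generator that yields AudioRawFrame objects;
        # collect the PCM payloads into a WAV file we can listen to.
        with wave.open("cartesia-test.wav", "wb") as f:
            f.setnchannels(1)
            f.setsampwidth(2)      # 16-bit linear PCM
            f.setframerate(16000)  # matches output_format='pcm_16000'
            async for frame in tts.run_tts("Hello from Cartesia!"):
                f.writeframes(bytes(frame.audio))

    asyncio.run(main())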