Skip to content

Commit

Permalink
cartesia tts support
Browse files Browse the repository at this point in the history
  • Loading branch information
kwindla committed May 30, 2024
1 parent c444004 commit 9207453
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 21 deletions.
93 changes: 93 additions & 0 deletions examples/foundational/07d-interruptible-cartesia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD-2-Clause
#

import asyncio
import aiohttp
import os
import sys

from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from runner import configure

from loguru import logger

from dotenv import load_dotenv
# Load settings from a .env file before anything reads os.getenv().
# NOTE(review): override=True presumably lets .env values replace variables
# already present in the environment — confirm against python-dotenv.
load_dotenv(override=True)

# Drop logger handler 0 (presumably loguru's pre-installed default sink —
# confirm) and log to stderr at the DEBUG level instead.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main(room_url: str, token):
    """Run the interruptible Cartesia voice bot in a Daily room.

    Builds a pipeline of Daily transport input -> user aggregator -> OpenAI
    LLM -> Cartesia TTS -> Daily transport output -> assistant aggregator,
    with interruptions allowed, and runs it until the runner returns.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport alongside the room URL.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(room_url, token, "Respond bot", daily_params)

        cartesia_key = os.getenv("CARTESIA_API_KEY")
        tts = CartesiaTTSService(api_key=cartesia_key, voice_name="Barbershop Man")

        openai_key = os.getenv("OPENAI_API_KEY")
        llm = OpenAILLMService(api_key=openai_key, model="gpt-4o")

        system_prompt = (
            "You are a helpful LLM in a WebRTC call. "
            "Your goal is to demonstrate your capabilities in a succinct way. "
            "Your output will be converted to audio so don't include special characters in your answers. "
            "Respond to what the user said in a creative and helpful way."
        )
        # The same list object is shared by both aggregators and by the
        # join handler below.
        messages = [{"role": "system", "content": system_prompt}]

        user_aggregator = LLMUserResponseAggregator(messages)
        assistant_aggregator = LLMAssistantResponseAggregator(messages)

        stages = [
            transport.input(),     # Transport user input
            user_aggregator,       # User responses
            llm,                   # LLM
            tts,                   # TTS
            transport.output(),    # Transport bot output
            assistant_aggregator,  # Assistant spoken responses
        ]
        pipeline = Pipeline(stages)

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Start receiving this participant's transcription, then have the
            # LLM open the conversation.
            transport.capture_participant_transcription(participant["id"])
            intro_request = {
                "role": "system",
                "content": "Please introduce yourself to the user.",
            }
            messages.append(intro_request)
            await task.queue_frames([LLMMessagesFrame(messages)])

        runner = PipelineRunner()
        await runner.run(task)


if __name__ == "__main__":
    # configure() comes from the local runner module and supplies the room
    # URL and token that main() needs.
    url, token = configure()
    asyncio.run(main(url, token))
39 changes: 33 additions & 6 deletions macos-py3.10-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
# pip-compile --all-extras pyproject.toml
#
aiohttp==3.9.5
# via pipecat-ai (pyproject.toml)
# via
# cartesia
# pipecat-ai (pyproject.toml)
aiosignal==1.3.1
# via aiohttp
annotated-types==0.7.0
Expand All @@ -29,11 +31,15 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.0
# via pipecat-ai (pyproject.toml)
certifi==2024.2.2
# via
# httpcore
# httpx
# requests
cffi==1.16.0
# via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
Expand All @@ -42,7 +48,7 @@ coloredlogs==15.0.1
# via onnxruntime
ctranslate2==4.2.1
# via faster-whisper
daily-python==0.9.0
daily-python==0.9.1
# via pipecat-ai (pyproject.toml)
distro==1.9.0
# via
Expand All @@ -51,7 +57,9 @@ distro==1.9.0
einops==0.8.0
# via pipecat-ai (pyproject.toml)
exceptiongroup==1.2.1
# via anyio
# via
# anyio
# pytest
fal-client==0.4.0
# via pipecat-ai (pyproject.toml)
faster-whisper==1.0.2
Expand Down Expand Up @@ -122,6 +130,7 @@ httplib2==0.22.0
httpx==0.27.0
# via
# anthropic
# cartesia
# fal-client
# openai
httpx-sse==0.4.0
Expand All @@ -140,6 +149,8 @@ idna==3.7
# httpx
# requests
# yarl
iniconfig==2.0.0
# via pytest
itsdangerous==2.2.0
# via flask
jinja2==3.1.4
Expand Down Expand Up @@ -177,11 +188,14 @@ packaging==24.0
# via
# huggingface-hub
# onnxruntime
# pytest
# transformers
pillow==10.3.0
# via
# pipecat-ai (pyproject.toml)
# torchvision
pluggy==1.5.0
# via pytest
proto-plus==1.23.0
# via
# google-ai-generativelanguage
Expand All @@ -204,6 +218,8 @@ pyasn1-modules==0.4.0
# via google-auth
pyaudio==0.2.14
# via pipecat-ai (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==2.7.2
# via
# anthropic
Expand All @@ -217,6 +233,10 @@ pyloudnorm==0.1.1
# via pipecat-ai (pyproject.toml)
pyparsing==3.1.2
# via httplib2
pytest==8.2.1
# via pytest-asyncio
pytest-asyncio==0.23.7
# via cartesia
python-dotenv==1.0.1
# via pipecat-ai (pyproject.toml)
pyyaml==6.0.1
Expand All @@ -227,8 +247,9 @@ pyyaml==6.0.1
# transformers
regex==2024.5.15
# via transformers
requests==2.32.2
requests==2.32.3
# via
# cartesia
# google-api-core
# huggingface-hub
# pyht
Expand All @@ -247,7 +268,9 @@ sniffio==1.3.1
# anyio
# httpx
# openai
sympy==1.12
sounddevice==0.4.7
# via pipecat-ai (pyproject.toml)
sympy==1.12.1
# via
# onnxruntime
# torch
Expand All @@ -258,6 +281,8 @@ tokenizers==0.19.1
# anthropic
# faster-whisper
# transformers
tomli==2.0.1
# via pytest
torch==2.3.0
# via
# pipecat-ai (pyproject.toml)
Expand Down Expand Up @@ -292,7 +317,9 @@ uritemplate==4.1.1
urllib3==2.2.1
# via requests
websockets==12.0
# via pipecat-ai (pyproject.toml)
# via
# cartesia
# pipecat-ai (pyproject.toml)
werkzeug==3.0.3
# via flask
yarl==1.9.4
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Website = "https://pipecat.ai"
[project.optional-dependencies]
anthropic = [ "anthropic~=0.25.7" ]
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
cartesia = [ "numpy", "sounddevice", "cartesia" ]
daily = [ "daily-python~=0.9.0" ]
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
fal = [ "fal-client~=0.4.0" ]
Expand Down
56 changes: 56 additions & 0 deletions src/pipecat/services/cartesia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD-2-Clause
#

from cartesia.tts import AsyncCartesiaTTS

import time
from typing import AsyncGenerator

from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
from pipecat.services.ai_services import TTSService

from loguru import logger


class CartesiaTTSService(TTSService):
    """Text-to-speech service that streams audio from Cartesia.

    The Cartesia client and the voice embedding for ``voice_name`` are
    created lazily on the first ``run_tts`` call and reused afterwards.
    """

    def __init__(
            self,
            *,
            api_key: str,
            voice_name: str,
            **kwargs):
        """Store credentials and voice selection; no client is created here.

        Args:
            api_key: Cartesia API key.
            voice_name: Key used to look up the voice in the mapping returned
                by the client's ``get_voices()``.
            **kwargs: Forwarded unchanged to ``TTSService.__init__``.
        """
        super().__init__(**kwargs)

        self._api_key = api_key
        self._voice_name = voice_name

        # Populated together by _ensure_client(); all three stay None until
        # the whole initialisation sequence has succeeded.
        self._client = None
        self._voice_id = None
        self._voice = None

    def _ensure_client(self):
        """Create the client and resolve the voice once, atomically.

        State is built in locals and only committed to ``self`` after every
        step succeeded. Otherwise a failed voice lookup would leave a
        non-None client without ``self._voice``, and every later call would
        skip initialisation and fail.

        Raises:
            Exception: Whatever the client constructor, ``get_voices()``, the
                voice-name lookup (``KeyError`` for an unknown name) or
                ``get_voice_embedding()`` raises.
        """
        if self._client is not None:
            return

        client = AsyncCartesiaTTS(api_key=self._api_key)
        voices = client.get_voices()
        voice_id = voices[self._voice_name]["id"]
        voice = client.get_voice_embedding(voice_id=voice_id)

        self._voice_id = voice_id
        self._voice = voice
        self._client = client

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Synthesize ``text`` and yield the audio as ``AudioRawFrame`` objects.

        Args:
            text: The text to speak.

        Yields:
            One ``AudioRawFrame`` (16000 Hz, 1 channel) per chunk produced by
            the Cartesia stream.

        Any exception raised during setup or streaming is logged and ends the
        generator; it is not re-raised.
        """
        logger.debug(f"Generating TTS: [{text}]")

        try:
            self._ensure_client()

            chunk_generator = await self._client.generate(
                transcript=text, voice=self._voice, stream=True,
                model_id="upbeat-moon", data_rtype='array', output_format='pcm_16000',
                # a chunk_time of 0.1 seems to be the default. there are small audio
                # pops/gaps which we need to debug
                chunk_time=0.1
            )

            async for chunk in chunk_generator:
                # The sample rate here must match the 'pcm_16000' output
                # format requested above.
                yield AudioRawFrame(chunk['audio'], 16000, 1)
        except Exception as e:
            logger.error(f"Exception {e}")
27 changes: 16 additions & 11 deletions src/pipecat/services/deepgram.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#

import aiohttp
import json

from typing import AsyncGenerator

Expand Down Expand Up @@ -32,17 +33,21 @@ def __init__(
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
    """Request speech for ``text`` from Deepgram and yield it as audio frames.

    Args:
        text: The text to speak.

    Yields:
        ``AudioRawFrame`` objects (16000 Hz, 1 channel) for each piece of the
        response body, or a single ``ErrorFrame`` when the HTTP status is
        not 200.

    Exceptions raised while posting or streaming are logged and end the
    generator; they are not re-raised.
    """
    logger.info(f"Running Deepgram TTS for {text}")
    base_url = "https://api.deepgram.com/v1/speak"
    # Built from adjacent literals so the replacement field stays on one line;
    # a newline inside an f-string field is a SyntaxError before Python 3.12.
    request_url = (
        f"{base_url}?model={self._voice}"
        "&encoding=linear16&container=none&sample_rate=16000"
    )
    headers = {"authorization": f"token {self._api_key}"}
    body = {"text": text}

    try:
        async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
            if r.status != 200:
                # Separate name so the ``text`` parameter is not overwritten.
                error_text = await r.text()
                message = f"Error getting audio (status: {r.status}, error: {error_text})"
                logger.error(message)
                yield ErrorFrame(message)
                return

            # NOTE(review): iterating an aiohttp response's content is
            # documented as line-based; confirm those chunk boundaries are
            # acceptable for raw 16-bit PCM.
            async for data in r.content:
                yield AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
    except Exception as e:
        logger.error(f"Exception {e}")
2 changes: 1 addition & 1 deletion src/pipecat/services/elevenlabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from typing import AsyncGenerator

from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame, TTSStartedFrame, TTSStoppedFrame, TextFrame
from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
from pipecat.services.ai_services import TTSService

from loguru import logger
Expand Down
15 changes: 12 additions & 3 deletions src/pipecat/services/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,18 @@ async def _process_context(self, context: OpenAILLMContext):
logger.debug(f"Google LLM TTFB: {time.time() - start_time}")

async for chunk in self._async_generator_wrapper(response):
await self.push_frame(LLMResponseStartFrame())
await self.push_frame(TextFrame(chunk.text))
await self.push_frame(LLMResponseEndFrame())
try:
text = chunk.text
await self.push_frame(LLMResponseStartFrame())
await self.push_frame(TextFrame(text))
await self.push_frame(LLMResponseEndFrame())
except Exception as e:
# Google LLMs seem to flag safety issues a lot!
if chunk.candidates[0].finish_reason == 3:
logger.debug(
f"LLM refused to generate content for safety reasons - {messages}.")
else:
logger.error(f"Error {e}")

except Exception as e:
logger.error(f"Exception: {e}")
Expand Down

0 comments on commit 9207453

Please sign in to comment.