From fa0deededa5c27cfbc190b207009626c3c00aabc Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Mon, 9 Sep 2024 10:53:23 +0900 Subject: [PATCH 1/5] Add voice options and make to use InputParams for Cartesia. --- .../07d-interruptible-cartesia.py | 4 +- .../12c-describe-video-anthropic.py | 4 +- examples/studypal/studypal.py | 4 +- src/pipecat/services/cartesia.py | 45 +++++++++++++------ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py index 6b8bbcc5f..7bcc7476b 100644 --- a/examples/foundational/07d-interruptible-cartesia.py +++ b/examples/foundational/07d-interruptible-cartesia.py @@ -52,7 +52,9 @@ async def main(): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man - sample_rate=44100, + params=CartesiaTTSService.InputParams( + sample_rate=44100, + ), ) llm = OpenAILLMService( diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py index cc1f14c92..8531debf8 100644 --- a/examples/foundational/12c-describe-video-anthropic.py +++ b/examples/foundational/12c-describe-video-anthropic.py @@ -78,7 +78,9 @@ async def main(): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady - sample_rate=16000, + params=CartesiaTTSService.InputParams( + sample_rate=16000, + ), ) @transport.event_handler("on_first_participant_joined") diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py index 8adfe2954..f14bd3def 100644 --- a/examples/studypal/studypal.py +++ b/examples/studypal/studypal.py @@ -124,7 +124,9 @@ async def main(): api_key=os.getenv("CARTESIA_API_KEY"), voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"), # British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9 - sample_rate=44100, + params=CartesiaTTSService.InputParams( + sample_rate=44100, + ), ) llm = OpenAILLMService( diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index e3541ccea..927da53f0 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -10,7 +10,8 @@ import asyncio import time -from typing import AsyncGenerator, Mapping +from typing import AsyncGenerator, Optional +from pydantic.main import BaseModel from pipecat.frames.frames import ( CancelFrame, @@ -61,6 +62,14 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(TTSService): + class InputParams(BaseModel): + model_id: Optional[str] = "sonic-english" + encoding: Optional[str] = "pcm_s16le" + sample_rate: Optional[int] = 16000 + container: Optional[str] = "raw" + language: Optional[str] = "en" + speed: Optional[str] = None + emotion: Optional[list[str]] = [] def __init__( self, @@ -69,10 +78,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", + params: InputParams = InputParams(), **kwargs): super().__init__(**kwargs) @@ -92,13 +98,15 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = model_id + self._model_id = params.model_id self._output_format = { - "container": "raw", - "encoding": encoding, - "sample_rate": sample_rate, + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, } - self._language = language + self._language = params.language + self._speed = params.speed + self._emotion = params.emotion self._websocket = None self._context_id = None @@ -249,15 +257,24 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() self._context_id = str(uuid.uuid4()) + voice_config = { + "mode": "id", + "id": self._voice_id + } + + if self._speed or self._emotion: + voice_config["__experimental_controls"] = {} + if self._speed: + voice_config["__experimental_controls"]["speed"] = self._speed + if self._emotion: + voice_config["__experimental_controls"]["emotion"] = self._emotion + msg = { "transcript": text + " ", "continue": True, "context_id": self._context_id, "model_id": self._model_id, - "voice": { - "mode": "id", - "id": self._voice_id - }, + "voice": voice_config, "output_format": self._output_format, "language": self._language, "add_timestamps": True, From 2da0ecbe3c7326d180125d16f32ee8f95113f847 Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Wed, 18 Sep 2024 00:38:12 +0900 Subject: [PATCH 2/5] Revert "model_id" as a main argument --- src/pipecat/services/cartesia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 25f54bf11..45e42470e 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -63,7 +63,6 @@ def language_to_cartesia_language(language: Language) -> str | None: class CartesiaTTSService(AsyncWordTTSService): class InputParams(BaseModel): - model_id: Optional[str] = "sonic-english" encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" @@ -78,6 +77,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", params: InputParams = InputParams(), **kwargs): # Aggregating sentences still gives cleaner-sounding results and fewer @@ -96,7 +96,7 @@ def __init__( self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id - self._model_id = params.model_id + self._model_id = model_id self._output_format = { "container": params.container, "encoding": params.encoding, From 75008d8f115ccab562a9b564e7415bdcb024a850 Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Wed, 18 Sep 2024 00:51:45 +0900 Subject: [PATCH 3/5] Add speed and emotion setting method to Cartesia TTS service --- src/pipecat/services/cartesia.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 45e42470e..6cec9c06d 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -121,6 +121,14 @@ async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice_id = voice + async def set_speed(self, speed: str): + logger.debug(f"Switching TTS speed to: [{speed}]") + self._speed = speed + + async def set_emotion(self, emotion: list[str]): + logger.debug(f"Switching TTS emotion to: [{emotion}]") + self._emotion = emotion + async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) From 49f212389305465791eab139083b2e3286a05dee Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Tue, 24 Sep 2024 07:59:26 +0900 Subject: [PATCH 4/5] Apply and Fix upstream changes for Cartesia --- src/pipecat/services/cartesia.py | 113 +++++++++++++++++++------------ 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 40475343c..f08c06dea 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -9,7 +9,7 @@ import base64 import asyncio -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, Optional, Union, List from pydantic.main import BaseModel from pipecat.frames.frames import ( @@ -67,8 +67,8 @@ class InputParams(BaseModel): sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" language: Optional[str] = "en" - speed: Optional[str] = None - emotion: Optional[list[str]] = [] + speed: Optional[Union[str, float]] = "" + emotion: Optional[List[str]] = [] def __init__( self, @@ -91,13 +91,14 @@ def __init__( # can use those to generate text frames ourselves aligned with the # playout timing of the audio! super().__init__( - aggregate_sentences=True, push_text_frames=False, sample_rate=sample_rate, **kwargs + aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs ) self._api_key = api_key self._cartesia_version = cartesia_version self._url = url self._voice_id = voice_id + self._model_id = model_id self.set_model_name(model_id) self._output_format = { "container": params.container, @@ -116,6 +117,7 @@ def can_generate_metrics(self) -> bool: return True async def set_model(self, model: str): + self._model_id = model await super().set_model(model) logger.debug(f"Switching TTS model to: [{model}]") @@ -135,6 +137,31 @@ async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) + def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True): + voice_config = { + "mode": "id", + "id": self._voice_id + } + + if self._speed or self._emotion: + voice_config["__experimental_controls"] = {} + if self._speed: + voice_config["__experimental_controls"]["speed"] = self._speed + if self._emotion: + voice_config["__experimental_controls"]["emotion"] = self._emotion + + msg = { + "transcript": text, + "continue": continue_transcript, + "context_id": self._context_id, + "model_id": self._model_name, + "voice": voice_config, + "output_format": self._output_format, + "language": self._language, + "add_timestamps": add_timestamps, + } + return json.dumps(msg) + async def start(self, frame: StartFrame): await super().start(frame) await self._connect() @@ -190,17 +217,8 @@ async def flush_audio(self): if not self._context_id or not self._websocket: return logger.trace("Flushing audio") - msg = { - "transcript": "", - "continue": False, - "context_id": self._context_id, - "model_id": self.model_name, - "voice": {"mode": "id", "id": self._voice_id}, - "output_format": self._output_format, - "language": self._language, - "add_timestamps": True, - } - await self._websocket.send(json.dumps(msg)) + msg = self._build_msg(text="", continue_transcript=False) + await self._websocket.send(msg) async def _receive_task_handler(self): try: @@ -255,30 +273,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() self._context_id = str(uuid.uuid4()) - voice_config = { - "mode": "id", - "id": self._voice_id - } + msg = self._build_msg(text=text) - if self._speed or self._emotion: - voice_config["__experimental_controls"] = {} - if self._speed: - voice_config["__experimental_controls"]["speed"] = self._speed - if self._emotion: - voice_config["__experimental_controls"]["emotion"] = self._emotion - - msg = { - "transcript": text + " ", - "continue": True, - "context_id": self._context_id, - "model_id": self._model_id, - "voice": voice_config, - "output_format": self._output_format, - "language": self._language, - "add_timestamps": True, - } try: - await self._get_websocket().send(json.dumps(msg)) + await self._get_websocket().send(msg) await self.start_tts_usage_metrics(text) except Exception as e: logger.error(f"{self} error sending message: {e}") @@ -292,6 +290,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: class CartesiaHttpTTSService(TTSService): + class InputParams(BaseModel): + encoding: Optional[str] = "pcm_s16le" + sample_rate: Optional[int] = 16000 + container: Optional[str] = "raw" + language: Optional[str] = "en" + speed: Optional[Union[str, float]] = "" + emotion: Optional[List[str]] = [] + def __init__( self, *, @@ -299,9 +305,7 @@ def __init__( voice_id: str, model_id: str = "sonic-english", base_url: str = "https://api.cartesia.ai", - encoding: str = "pcm_s16le", - sample_rate: int = 16000, - language: str = "en", + params: InputParams = InputParams(), **kwargs, ): super().__init__(**kwargs) @@ -309,12 +313,15 @@ def __init__( self._api_key = api_key self._voice_id = voice_id self._model_id = model_id + self.set_model_name(model_id) self._output_format = { - "container": "raw", - "encoding": encoding, - "sample_rate": sample_rate, + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, } - self._language = language + self._language = params.language + self._speed = params.speed + self._emotion = params.emotion self._client = AsyncCartesia(api_key=api_key, base_url=base_url) @@ -324,11 +331,20 @@ def can_generate_metrics(self) -> bool: async def set_model(self, model: str): logger.debug(f"Switching TTS model to: [{model}]") self._model_id = model + await super().set_model(model) async def set_voice(self, voice: str): logger.debug(f"Switching TTS voice to: [{voice}]") self._voice_id = voice + async def set_speed(self, speed: str): + logger.debug(f"Switching TTS speed to: [{speed}]") + self._speed = speed + + async def set_emotion(self, emotion: list[str]): + logger.debug(f"Switching TTS emotion to: [{emotion}]") + self._emotion = emotion + async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) @@ -348,6 +364,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() try: + voice_controls = None + if self._speed or self._emotion: + voice_controls = {} + if self._speed: + voice_controls["speed"] = self._speed + if self._emotion: + voice_controls["emotion"] = self._emotion + output = await self._client.tts.sse( model_id=self._model_id, transcript=text, @@ -355,6 +379,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: output_format=self._output_format, language=self._language, stream=False, + _experimental_voice_controls=voice_controls ) await self.stop_ttfb_metrics() From d05717a1bd709ecbf475bb2ff410763bcd8783cd Mon Sep 17 00:00:00 2001 From: Jin Kim Date: Thu, 26 Sep 2024 19:52:25 +0900 Subject: [PATCH 5/5] Apply Ruff formater --- src/pipecat/services/cartesia.py | 41 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index f08c06dea..c1c296046 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -71,15 +71,16 @@ class InputParams(BaseModel): emotion: Optional[List[str]] = [] def __init__( - self, - *, - api_key: str, - voice_id: str, - cartesia_version: str = "2024-06-10", - url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", - params: InputParams = InputParams(), - **kwargs): + self, + *, + api_key: str, + voice_id: str, + cartesia_version: str = "2024-06-10", + url: str = "wss://api.cartesia.ai/tts/websocket", + model_id: str = "sonic-english", + params: InputParams = InputParams(), + **kwargs, + ): # Aggregating sentences still gives cleaner-sounding results and fewer # artifacts than streaming one word at a time. On average, waiting for a # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama @@ -91,7 +92,10 @@ def __init__( # can use those to generate text frames ourselves aligned with the # playout timing of the audio! super().__init__( - aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs + aggregate_sentences=True, + push_text_frames=False, + sample_rate=params.sample_rate, + **kwargs, ) self._api_key = api_key @@ -137,11 +141,10 @@ async def set_language(self, language: Language): logger.debug(f"Switching TTS language to: [{language}]") self._language = language_to_cartesia_language(language) - def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True): - voice_config = { - "mode": "id", - "id": self._voice_id - } + def _build_msg( + self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True + ): + voice_config = {"mode": "id", "id": self._voice_id} if self._speed or self._emotion: voice_config["__experimental_controls"] = {} @@ -236,8 +239,7 @@ async def _receive_task_handler(self): await self.add_word_timestamps([("LLMFullResponseEndFrame", 0)]) elif msg["type"] == "timestamps": await self.add_word_timestamps( - list(zip(msg["word_timestamps"]["words"], - msg["word_timestamps"]["start"])) + list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"])) ) elif msg["type"] == "chunk": await self.stop_ttfb_metrics() @@ -254,8 +256,7 @@ async def _receive_task_handler(self): await self.stop_all_metrics() await self.push_error(ErrorFrame(f'{self} error: {msg["error"]}')) else: - logger.error( - f"Cartesia error, unknown message type: {msg}") + logger.error(f"Cartesia error, unknown message type: {msg}") except asyncio.CancelledError: pass except Exception as e: @@ -379,7 +380,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: output_format=self._output_format, language=self._language, stream=False, - _experimental_voice_controls=voice_controls + _experimental_voice_controls=voice_controls, ) await self.stop_ttfb_metrics()