From fa0deededa5c27cfbc190b207009626c3c00aabc Mon Sep 17 00:00:00 2001
From: Jin Kim
Date: Mon, 9 Sep 2024 10:53:23 +0900
Subject: [PATCH] Add voice options and use InputParams for Cartesia.

---
 .../07d-interruptible-cartesia.py   |  4 +-
 .../12c-describe-video-anthropic.py |  4 +-
 examples/studypal/studypal.py       |  4 +-
 src/pipecat/services/cartesia.py    | 45 +++++++++++++------
 4 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py
index 6b8bbcc5f..7bcc7476b 100644
--- a/examples/foundational/07d-interruptible-cartesia.py
+++ b/examples/foundational/07d-interruptible-cartesia.py
@@ -52,7 +52,9 @@ async def main():
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",  # Barbershop Man
-            sample_rate=44100,
+            params=CartesiaTTSService.InputParams(
+                sample_rate=44100,
+            ),
         )
 
         llm = OpenAILLMService(
diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py
index cc1f14c92..8531debf8 100644
--- a/examples/foundational/12c-describe-video-anthropic.py
+++ b/examples/foundational/12c-describe-video-anthropic.py
@@ -78,7 +78,9 @@ async def main():
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
-            sample_rate=16000,
+            params=CartesiaTTSService.InputParams(
+                sample_rate=16000,
+            ),
         )
 
         @transport.event_handler("on_first_participant_joined")
diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py
index 8adfe2954..f14bd3def 100644
--- a/examples/studypal/studypal.py
+++ b/examples/studypal/studypal.py
@@ -124,7 +124,9 @@ async def main():
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"),
             # British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9
-            sample_rate=44100,
+            params=CartesiaTTSService.InputParams(
+                sample_rate=44100,
+            ),
         )
 
         llm = OpenAILLMService(
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index e3541ccea..927da53f0 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -10,7 +10,8 @@
 import asyncio
 import time
 
-from typing import AsyncGenerator, Mapping
+from typing import AsyncGenerator, Optional
+from pydantic.main import BaseModel
 
 from pipecat.frames.frames import (
     CancelFrame,
@@ -61,6 +62,14 @@ def language_to_cartesia_language(language: Language) -> str | None:
 
 
 class CartesiaTTSService(TTSService):
+    class InputParams(BaseModel):
+        model_id: Optional[str] = "sonic-english"
+        encoding: Optional[str] = "pcm_s16le"
+        sample_rate: Optional[int] = 16000
+        container: Optional[str] = "raw"
+        language: Optional[str] = "en"
+        speed: Optional[str] = None
+        emotion: Optional[list[str]] = []
 
     def __init__(
             self,
@@ -69,10 +78,7 @@ def __init__(
             voice_id: str,
             cartesia_version: str = "2024-06-10",
             url: str = "wss://api.cartesia.ai/tts/websocket",
-            model_id: str = "sonic-english",
-            encoding: str = "pcm_s16le",
-            sample_rate: int = 16000,
-            language: str = "en",
+            params: InputParams = InputParams(),
             **kwargs):
         super().__init__(**kwargs)
 
@@ -92,13 +98,15 @@ def __init__(
         self._cartesia_version = cartesia_version
         self._url = url
         self._voice_id = voice_id
-        self._model_id = model_id
+        self._model_id = params.model_id
         self._output_format = {
-            "container": "raw",
-            "encoding": encoding,
-            "sample_rate": sample_rate,
+            "container": params.container,
+            "encoding": params.encoding,
+            "sample_rate": params.sample_rate,
         }
-        self._language = language
+        self._language = params.language
+        self._speed = params.speed
+        self._emotion = params.emotion
 
         self._websocket = None
         self._context_id = None
@@ -249,15 +257,24 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 await self.start_ttfb_metrics()
                 self._context_id = str(uuid.uuid4())
 
+            voice_config = {
+                "mode": "id",
+                "id": self._voice_id
+            }
+
+            if self._speed or self._emotion:
+                voice_config["__experimental_controls"] = {}
+                if self._speed:
+                    voice_config["__experimental_controls"]["speed"] = self._speed
+                if self._emotion:
+                    voice_config["__experimental_controls"]["emotion"] = self._emotion
+
             msg = {
                 "transcript": text + " ",
                 "continue": True,
                 "context_id": self._context_id,
                 "model_id": self._model_id,
-                "voice": {
-                    "mode": "id",
-                    "id": self._voice_id
-                },
+                "voice": voice_config,
                 "output_format": self._output_format,
                 "language": self._language,
                 "add_timestamps": True,