From fa0deededa5c27cfbc190b207009626c3c00aabc Mon Sep 17 00:00:00 2001
From: Jin Kim <golbin@gmail.com>
Date: Mon, 9 Sep 2024 10:53:23 +0900
Subject: [PATCH 1/5] Add voice options and use InputParams for
 Cartesia.

---
 .../07d-interruptible-cartesia.py             |  4 +-
 .../12c-describe-video-anthropic.py           |  4 +-
 examples/studypal/studypal.py                 |  4 +-
 src/pipecat/services/cartesia.py              | 45 +++++++++++++------
 4 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py
index 6b8bbcc5f..7bcc7476b 100644
--- a/examples/foundational/07d-interruptible-cartesia.py
+++ b/examples/foundational/07d-interruptible-cartesia.py
@@ -52,7 +52,9 @@ async def main():
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",  # Barbershop Man
-            sample_rate=44100,
+            params=CartesiaTTSService.InputParams(
+                sample_rate=44100,
+            ),
         )
 
         llm = OpenAILLMService(
diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py
index cc1f14c92..8531debf8 100644
--- a/examples/foundational/12c-describe-video-anthropic.py
+++ b/examples/foundational/12c-describe-video-anthropic.py
@@ -78,7 +78,9 @@ async def main():
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
-            sample_rate=16000,
+            params=CartesiaTTSService.InputParams(
+                sample_rate=16000,
+            ),
         )
 
         @transport.event_handler("on_first_participant_joined")
diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py
index 8adfe2954..f14bd3def 100644
--- a/examples/studypal/studypal.py
+++ b/examples/studypal/studypal.py
@@ -124,7 +124,9 @@ async def main():
             api_key=os.getenv("CARTESIA_API_KEY"),
             voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"),
             # British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9
-            sample_rate=44100,
+            params=CartesiaTTSService.InputParams(
+                sample_rate=44100,
+            ),
         )
 
         llm = OpenAILLMService(
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index e3541ccea..927da53f0 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -10,7 +10,8 @@
 import asyncio
 import time
 
-from typing import AsyncGenerator, Mapping
+from typing import AsyncGenerator, Optional
+from pydantic.main import BaseModel
 
 from pipecat.frames.frames import (
     CancelFrame,
@@ -61,6 +62,14 @@ def language_to_cartesia_language(language: Language) -> str | None:
 
 
 class CartesiaTTSService(TTSService):
+    class InputParams(BaseModel):
+        model_id: Optional[str] = "sonic-english"
+        encoding: Optional[str] = "pcm_s16le"
+        sample_rate: Optional[int] = 16000
+        container: Optional[str] = "raw"
+        language: Optional[str] = "en"
+        speed: Optional[str] = None
+        emotion: Optional[list[str]] = []
 
     def __init__(
             self,
@@ -69,10 +78,7 @@ def __init__(
             voice_id: str,
             cartesia_version: str = "2024-06-10",
             url: str = "wss://api.cartesia.ai/tts/websocket",
-            model_id: str = "sonic-english",
-            encoding: str = "pcm_s16le",
-            sample_rate: int = 16000,
-            language: str = "en",
+            params: InputParams = InputParams(),
             **kwargs):
         super().__init__(**kwargs)
 
@@ -92,13 +98,15 @@ def __init__(
         self._cartesia_version = cartesia_version
         self._url = url
         self._voice_id = voice_id
-        self._model_id = model_id
+        self._model_id = params.model_id
         self._output_format = {
-            "container": "raw",
-            "encoding": encoding,
-            "sample_rate": sample_rate,
+            "container": params.container,
+            "encoding": params.encoding,
+            "sample_rate": params.sample_rate,
         }
-        self._language = language
+        self._language = params.language
+        self._speed = params.speed
+        self._emotion = params.emotion
 
         self._websocket = None
         self._context_id = None
@@ -249,15 +257,24 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 await self.start_ttfb_metrics()
                 self._context_id = str(uuid.uuid4())
 
+            voice_config = {
+                "mode": "id",
+                "id": self._voice_id
+            }
+
+            if self._speed or self._emotion:
+                voice_config["__experimental_controls"] = {}
+                if self._speed:
+                    voice_config["__experimental_controls"]["speed"] = self._speed
+                if self._emotion:
+                    voice_config["__experimental_controls"]["emotion"] = self._emotion
+
             msg = {
                 "transcript": text + " ",
                 "continue": True,
                 "context_id": self._context_id,
                 "model_id": self._model_id,
-                "voice": {
-                    "mode": "id",
-                    "id": self._voice_id
-                },
+                "voice": voice_config,
                 "output_format": self._output_format,
                 "language": self._language,
                 "add_timestamps": True,

From 2da0ecbe3c7326d180125d16f32ee8f95113f847 Mon Sep 17 00:00:00 2001
From: Jin Kim <golbin@gmail.com>
Date: Wed, 18 Sep 2024 00:38:12 +0900
Subject: [PATCH 2/5] Revert "model_id" as a main argument

---
 src/pipecat/services/cartesia.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 25f54bf11..45e42470e 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -63,7 +63,6 @@ def language_to_cartesia_language(language: Language) -> str | None:
 
 class CartesiaTTSService(AsyncWordTTSService):
     class InputParams(BaseModel):
-        model_id: Optional[str] = "sonic-english"
         encoding: Optional[str] = "pcm_s16le"
         sample_rate: Optional[int] = 16000
         container: Optional[str] = "raw"
@@ -78,6 +77,7 @@ def __init__(
             voice_id: str,
             cartesia_version: str = "2024-06-10",
             url: str = "wss://api.cartesia.ai/tts/websocket",
+            model_id: str = "sonic-english",
             params: InputParams = InputParams(),
             **kwargs):
         # Aggregating sentences still gives cleaner-sounding results and fewer
@@ -96,7 +96,7 @@ def __init__(
         self._cartesia_version = cartesia_version
         self._url = url
         self._voice_id = voice_id
-        self._model_id = params.model_id
+        self._model_id = model_id
         self._output_format = {
             "container": params.container,
             "encoding": params.encoding,

From 75008d8f115ccab562a9b564e7415bdcb024a850 Mon Sep 17 00:00:00 2001
From: Jin Kim <golbin@gmail.com>
Date: Wed, 18 Sep 2024 00:51:45 +0900
Subject: [PATCH 3/5] Add speed and emotion setter methods to Cartesia TTS
 service

---
 src/pipecat/services/cartesia.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 45e42470e..6cec9c06d 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -121,6 +121,14 @@ async def set_voice(self, voice: str):
         logger.debug(f"Switching TTS voice to: [{voice}]")
         self._voice_id = voice
 
+    async def set_speed(self, speed: str):
+        logger.debug(f"Switching TTS speed to: [{speed}]")
+        self._speed = speed
+
+    async def set_emotion(self, emotion: list[str]):
+        logger.debug(f"Switching TTS emotion to: [{emotion}]")
+        self._emotion = emotion
+
     async def set_language(self, language: Language):
         logger.debug(f"Switching TTS language to: [{language}]")
         self._language = language_to_cartesia_language(language)

From 49f212389305465791eab139083b2e3286a05dee Mon Sep 17 00:00:00 2001
From: Jin Kim <golbin@gmail.com>
Date: Tue, 24 Sep 2024 07:59:26 +0900
Subject: [PATCH 4/5] Apply and Fix upstream changes for Cartesia

---
 src/pipecat/services/cartesia.py | 113 +++++++++++++++++++------------
 1 file changed, 69 insertions(+), 44 deletions(-)

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 40475343c..f08c06dea 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -9,7 +9,7 @@
 import base64
 import asyncio
 
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, Optional, Union, List
 from pydantic.main import BaseModel
 
 from pipecat.frames.frames import (
@@ -67,8 +67,8 @@ class InputParams(BaseModel):
         sample_rate: Optional[int] = 16000
         container: Optional[str] = "raw"
         language: Optional[str] = "en"
-        speed: Optional[str] = None
-        emotion: Optional[list[str]] = []
+        speed: Optional[Union[str, float]] = ""
+        emotion: Optional[List[str]] = []
 
     def __init__(
             self,
@@ -91,13 +91,14 @@ def __init__(
         # can use those to generate text frames ourselves aligned with the
         # playout timing of the audio!
         super().__init__(
-            aggregate_sentences=True, push_text_frames=False, sample_rate=sample_rate, **kwargs
+            aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs
         )
 
         self._api_key = api_key
         self._cartesia_version = cartesia_version
         self._url = url
         self._voice_id = voice_id
+        self._model_id = model_id
         self.set_model_name(model_id)
         self._output_format = {
             "container": params.container,
@@ -116,6 +117,7 @@ def can_generate_metrics(self) -> bool:
         return True
 
     async def set_model(self, model: str):
+        self._model_id = model
         await super().set_model(model)
         logger.debug(f"Switching TTS model to: [{model}]")
 
@@ -135,6 +137,31 @@ async def set_language(self, language: Language):
         logger.debug(f"Switching TTS language to: [{language}]")
         self._language = language_to_cartesia_language(language)
 
+    def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True):
+        voice_config = {
+            "mode": "id",
+            "id": self._voice_id
+        }
+
+        if self._speed or self._emotion:
+            voice_config["__experimental_controls"] = {}
+            if self._speed:
+                voice_config["__experimental_controls"]["speed"] = self._speed
+            if self._emotion:
+                voice_config["__experimental_controls"]["emotion"] = self._emotion
+
+        msg = {
+            "transcript": text,
+            "continue": continue_transcript,
+            "context_id": self._context_id,
+            "model_id": self._model_name,
+            "voice": voice_config,
+            "output_format": self._output_format,
+            "language": self._language,
+            "add_timestamps": add_timestamps,
+        }
+        return json.dumps(msg)
+
     async def start(self, frame: StartFrame):
         await super().start(frame)
         await self._connect()
@@ -190,17 +217,8 @@ async def flush_audio(self):
         if not self._context_id or not self._websocket:
             return
         logger.trace("Flushing audio")
-        msg = {
-            "transcript": "",
-            "continue": False,
-            "context_id": self._context_id,
-            "model_id": self.model_name,
-            "voice": {"mode": "id", "id": self._voice_id},
-            "output_format": self._output_format,
-            "language": self._language,
-            "add_timestamps": True,
-        }
-        await self._websocket.send(json.dumps(msg))
+        msg = self._build_msg(text="", continue_transcript=False)
+        await self._websocket.send(msg)
 
     async def _receive_task_handler(self):
         try:
@@ -255,30 +273,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 await self.start_ttfb_metrics()
                 self._context_id = str(uuid.uuid4())
 
-            voice_config = {
-                "mode": "id",
-                "id": self._voice_id
-            }
+            msg = self._build_msg(text=text)
 
-            if self._speed or self._emotion:
-                voice_config["__experimental_controls"] = {}
-                if self._speed:
-                    voice_config["__experimental_controls"]["speed"] = self._speed
-                if self._emotion:
-                    voice_config["__experimental_controls"]["emotion"] = self._emotion
-
-            msg = {
-                "transcript": text + " ",
-                "continue": True,
-                "context_id": self._context_id,
-                "model_id": self._model_id,
-                "voice": voice_config,
-                "output_format": self._output_format,
-                "language": self._language,
-                "add_timestamps": True,
-            }
             try:
-                await self._get_websocket().send(json.dumps(msg))
+                await self._get_websocket().send(msg)
                 await self.start_tts_usage_metrics(text)
             except Exception as e:
                 logger.error(f"{self} error sending message: {e}")
@@ -292,6 +290,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
 
 class CartesiaHttpTTSService(TTSService):
+    class InputParams(BaseModel):
+        encoding: Optional[str] = "pcm_s16le"
+        sample_rate: Optional[int] = 16000
+        container: Optional[str] = "raw"
+        language: Optional[str] = "en"
+        speed: Optional[Union[str, float]] = ""
+        emotion: Optional[List[str]] = []
+
     def __init__(
         self,
         *,
@@ -299,9 +305,7 @@ def __init__(
         voice_id: str,
         model_id: str = "sonic-english",
         base_url: str = "https://api.cartesia.ai",
-        encoding: str = "pcm_s16le",
-        sample_rate: int = 16000,
-        language: str = "en",
+        params: InputParams = InputParams(),
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -309,12 +313,15 @@ def __init__(
         self._api_key = api_key
         self._voice_id = voice_id
         self._model_id = model_id
+        self.set_model_name(model_id)
         self._output_format = {
-            "container": "raw",
-            "encoding": encoding,
-            "sample_rate": sample_rate,
+            "container": params.container,
+            "encoding": params.encoding,
+            "sample_rate": params.sample_rate,
         }
-        self._language = language
+        self._language = params.language
+        self._speed = params.speed
+        self._emotion = params.emotion
 
         self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
 
@@ -324,11 +331,20 @@ def can_generate_metrics(self) -> bool:
     async def set_model(self, model: str):
         logger.debug(f"Switching TTS model to: [{model}]")
         self._model_id = model
+        await super().set_model(model)
 
     async def set_voice(self, voice: str):
         logger.debug(f"Switching TTS voice to: [{voice}]")
         self._voice_id = voice
 
+    async def set_speed(self, speed: str):
+        logger.debug(f"Switching TTS speed to: [{speed}]")
+        self._speed = speed
+
+    async def set_emotion(self, emotion: list[str]):
+        logger.debug(f"Switching TTS emotion to: [{emotion}]")
+        self._emotion = emotion
+
     async def set_language(self, language: Language):
         logger.debug(f"Switching TTS language to: [{language}]")
         self._language = language_to_cartesia_language(language)
@@ -348,6 +364,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         await self.start_ttfb_metrics()
 
         try:
+            voice_controls = None
+            if self._speed or self._emotion:
+                voice_controls = {}
+                if self._speed:
+                    voice_controls["speed"] = self._speed
+                if self._emotion:
+                    voice_controls["emotion"] = self._emotion
+
             output = await self._client.tts.sse(
                 model_id=self._model_id,
                 transcript=text,
@@ -355,6 +379,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 output_format=self._output_format,
                 language=self._language,
                 stream=False,
+                _experimental_voice_controls=voice_controls
             )
 
             await self.stop_ttfb_metrics()

From d05717a1bd709ecbf475bb2ff410763bcd8783cd Mon Sep 17 00:00:00 2001
From: Jin Kim <golbin@gmail.com>
Date: Thu, 26 Sep 2024 19:52:25 +0900
Subject: [PATCH 5/5] Apply Ruff formatter

---
 src/pipecat/services/cartesia.py | 41 ++++++++++++++++----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index f08c06dea..c1c296046 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -71,15 +71,16 @@ class InputParams(BaseModel):
         emotion: Optional[List[str]] = []
 
     def __init__(
-            self,
-            *,
-            api_key: str,
-            voice_id: str,
-            cartesia_version: str = "2024-06-10",
-            url: str = "wss://api.cartesia.ai/tts/websocket",
-            model_id: str = "sonic-english",
-            params: InputParams = InputParams(),
-            **kwargs):
+        self,
+        *,
+        api_key: str,
+        voice_id: str,
+        cartesia_version: str = "2024-06-10",
+        url: str = "wss://api.cartesia.ai/tts/websocket",
+        model_id: str = "sonic-english",
+        params: InputParams = InputParams(),
+        **kwargs,
+    ):
         # Aggregating sentences still gives cleaner-sounding results and fewer
         # artifacts than streaming one word at a time. On average, waiting for a
         # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama
@@ -91,7 +92,10 @@ def __init__(
         # can use those to generate text frames ourselves aligned with the
         # playout timing of the audio!
         super().__init__(
-            aggregate_sentences=True, push_text_frames=False, sample_rate=params.sample_rate, **kwargs
+            aggregate_sentences=True,
+            push_text_frames=False,
+            sample_rate=params.sample_rate,
+            **kwargs,
         )
 
         self._api_key = api_key
@@ -137,11 +141,10 @@ async def set_language(self, language: Language):
         logger.debug(f"Switching TTS language to: [{language}]")
         self._language = language_to_cartesia_language(language)
 
-    def _build_msg(self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True):
-        voice_config = {
-            "mode": "id",
-            "id": self._voice_id
-        }
+    def _build_msg(
+        self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
+    ):
+        voice_config = {"mode": "id", "id": self._voice_id}
 
         if self._speed or self._emotion:
             voice_config["__experimental_controls"] = {}
@@ -236,8 +239,7 @@ async def _receive_task_handler(self):
                     await self.add_word_timestamps([("LLMFullResponseEndFrame", 0)])
                 elif msg["type"] == "timestamps":
                     await self.add_word_timestamps(
-                        list(zip(msg["word_timestamps"]["words"],
-                             msg["word_timestamps"]["start"]))
+                        list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]))
                     )
                 elif msg["type"] == "chunk":
                     await self.stop_ttfb_metrics()
@@ -254,8 +256,7 @@ async def _receive_task_handler(self):
                     await self.stop_all_metrics()
                     await self.push_error(ErrorFrame(f'{self} error: {msg["error"]}'))
                 else:
-                    logger.error(
-                        f"Cartesia error, unknown message type: {msg}")
+                    logger.error(f"Cartesia error, unknown message type: {msg}")
         except asyncio.CancelledError:
             pass
         except Exception as e:
@@ -379,7 +380,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 output_format=self._output_format,
                 language=self._language,
                 stream=False,
-                _experimental_voice_controls=voice_controls
+                _experimental_voice_controls=voice_controls,
             )
 
             await self.stop_ttfb_metrics()