Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add speed and emotion options for Cartesia. #435

Merged
merged 8 commits into from
Sep 26, 2024
Merged
4 changes: 3 additions & 1 deletion examples/foundational/07d-interruptible-cartesia.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ async def main():
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
sample_rate=44100,
params=CartesiaTTSService.InputParams(
sample_rate=44100,
),
)

llm = OpenAILLMService(
Expand Down
4 changes: 3 additions & 1 deletion examples/foundational/12c-describe-video-anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ async def main():
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
sample_rate=16000,
params=CartesiaTTSService.InputParams(
sample_rate=16000,
),
)

@transport.event_handler("on_first_participant_joined")
Expand Down
4 changes: 3 additions & 1 deletion examples/studypal/studypal.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,9 @@ async def main():
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"),
# British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9
sample_rate=44100,
params=CartesiaTTSService.InputParams(
sample_rate=44100,
),
)

llm = OpenAILLMService(
Expand Down
45 changes: 31 additions & 14 deletions src/pipecat/services/cartesia.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import asyncio
import time

from typing import AsyncGenerator, Mapping
from typing import AsyncGenerator, Optional
from pydantic.main import BaseModel

from pipecat.frames.frames import (
CancelFrame,
Expand Down Expand Up @@ -61,6 +62,14 @@ def language_to_cartesia_language(language: Language) -> str | None:


class CartesiaTTSService(TTSService):
class InputParams(BaseModel):
model_id: Optional[str] = "sonic-english"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd keep `model_id` as a top-level constructor argument rather than moving it into `InputParams`, consistent with all the other AI services. The rest looks great!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's also a merge conflict that needs to be resolved.

encoding: Optional[str] = "pcm_s16le"
sample_rate: Optional[int] = 16000
container: Optional[str] = "raw"
language: Optional[str] = "en"
speed: Optional[str] = None
emotion: Optional[list[str]] = []

def __init__(
self,
Expand All @@ -69,10 +78,7 @@ def __init__(
voice_id: str,
cartesia_version: str = "2024-06-10",
url: str = "wss://api.cartesia.ai/tts/websocket",
model_id: str = "sonic-english",
encoding: str = "pcm_s16le",
sample_rate: int = 16000,
language: str = "en",
params: InputParams = InputParams(),
**kwargs):
super().__init__(**kwargs)

Expand All @@ -92,13 +98,15 @@ def __init__(
self._cartesia_version = cartesia_version
self._url = url
self._voice_id = voice_id
self._model_id = model_id
self._model_id = params.model_id
self._output_format = {
"container": "raw",
"encoding": encoding,
"sample_rate": sample_rate,
"container": params.container,
"encoding": params.encoding,
"sample_rate": params.sample_rate,
}
self._language = language
self._language = params.language
self._speed = params.speed
self._emotion = params.emotion

self._websocket = None
self._context_id = None
Expand Down Expand Up @@ -249,15 +257,24 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
await self.start_ttfb_metrics()
self._context_id = str(uuid.uuid4())

voice_config = {
"mode": "id",
"id": self._voice_id
}

if self._speed or self._emotion:
voice_config["__experimental_controls"] = {}
if self._speed:
voice_config["__experimental_controls"]["speed"] = self._speed
if self._emotion:
voice_config["__experimental_controls"]["emotion"] = self._emotion

msg = {
"transcript": text + " ",
"continue": True,
"context_id": self._context_id,
"model_id": self._model_id,
"voice": {
"mode": "id",
"id": self._voice_id
},
"voice": voice_config,
"output_format": self._output_format,
"language": self._language,
"add_timestamps": True,
Expand Down