diff --git a/CHANGELOG.md b/CHANGELOG.md
index b59ed56c8..922a21f21 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -86,8 +86,8 @@ async def on_connected(processor):
### Changed
-- Updated individual update settings frame classes into a single UpdateSettingsFrame
- class for STT, LLM, and TTS.
+- Merged the individual STT, LLM, and TTS update settings frame classes into a
+ single `ServiceUpdateSettingsFrame` class that carries a `settings` dict.
- We now distinguish between input and output audio and image frames. We
introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame`
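A hedged usage sketch (not part of this patch): with the consolidated frame, runtime
updates are expressed as a plain settings dict, and each service only applies keys it
declared in its `_settings`. The task variable, key names, and values below are illustrative.

```python
from pipecat.frames.frames import LLMUpdateSettingsFrame, TTSUpdateSettingsFrame


async def apply_runtime_settings(task):
    # `task` is assumed to be a running pipecat PipelineTask.
    await task.queue_frames(
        [
            LLMUpdateSettingsFrame(settings={"temperature": 0.2, "max_tokens": 512}),
            TTSUpdateSettingsFrame(settings={"voice": "example-voice-id"}),  # placeholder id
        ]
    )
```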
diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py
index 9c48df93a..b05c0d9fd 100644
--- a/examples/foundational/07e-interruptible-playht.py
+++ b/examples/foundational/07e-interruptible-playht.py
@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import os
import sys
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,11 @@
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
-from pipecat.services.playht import PlayHTTTSService
from pipecat.services.openai import OpenAILLMService
+from pipecat.services.playht import PlayHTTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
-from runner import configure
-
-from loguru import logger
-
-from dotenv import load_dotenv
-
load_dotenv(override=True)
logger.remove(0)
diff --git a/examples/foundational/07g-interruptible-openai-tts.py b/examples/foundational/07g-interruptible-openai-tts.py
index 70576c97a..cabf1245e 100644
--- a/examples/foundational/07g-interruptible-openai-tts.py
+++ b/examples/foundational/07g-interruptible-openai-tts.py
@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import os
import sys
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,10 @@
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
-from pipecat.services.openai import OpenAITTSService
-from pipecat.services.openai import OpenAILLMService
+from pipecat.services.openai import OpenAILLMService, OpenAITTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
-from runner import configure
-
-from loguru import logger
-
-from dotenv import load_dotenv
-
load_dotenv(override=True)
logger.remove(0)
diff --git a/examples/foundational/16-gpu-container-local-bot.py b/examples/foundational/16-gpu-container-local-bot.py
index 06bf45195..55286eed5 100644
--- a/examples/foundational/16-gpu-container-local-bot.py
+++ b/examples/foundational/16-gpu-container-local-bot.py
@@ -5,10 +5,14 @@
#
import asyncio
-import aiohttp
import os
import sys
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -26,12 +30,6 @@
)
from pipecat.vad.silero import SileroVADAnalyzer
-from runner import configure
-
-from loguru import logger
-
-from dotenv import load_dotenv
-
load_dotenv(override=True)
logger.remove(0)
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index f7faa8ef0..e2ef78df5 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -5,7 +5,7 @@
#
from dataclasses import dataclass, field
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple
from pipecat.clocks.base_clock import BaseClock
from pipecat.metrics.metrics import MetricsData
@@ -527,45 +527,25 @@ def __str__(self):
@dataclass
-class LLMUpdateSettingsFrame(ControlFrame):
- """A control frame containing a request to update LLM settings."""
+class ServiceUpdateSettingsFrame(ControlFrame):
+ """A control frame containing a request to update service settings."""
- model: Optional[str] = None
- temperature: Optional[float] = None
- top_k: Optional[int] = None
- top_p: Optional[float] = None
- frequency_penalty: Optional[float] = None
- presence_penalty: Optional[float] = None
- max_tokens: Optional[int] = None
- seed: Optional[int] = None
- extra: dict = field(default_factory=dict)
+ settings: Dict[str, Any]
@dataclass
-class TTSUpdateSettingsFrame(ControlFrame):
- """A control frame containing a request to update TTS settings."""
-
- model: Optional[str] = None
- voice: Optional[str] = None
- language: Optional[Language] = None
- speed: Optional[Union[str, float]] = None
- emotion: Optional[List[str]] = None
- engine: Optional[str] = None
- pitch: Optional[str] = None
- rate: Optional[str] = None
- volume: Optional[str] = None
- emphasis: Optional[str] = None
- style: Optional[str] = None
- style_degree: Optional[str] = None
- role: Optional[str] = None
+class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+ pass
@dataclass
-class STTUpdateSettingsFrame(ControlFrame):
- """A control frame containing a request to update STT settings."""
+class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+ pass
- model: Optional[str] = None
- language: Optional[Language] = None
+
+@dataclass
+class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+ pass
@dataclass
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index a329239dc..64d79b582 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -8,7 +8,7 @@
import io
import wave
from abc import abstractmethod
-from typing import AsyncGenerator, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from loguru import logger
@@ -45,6 +45,7 @@ class AIService(FrameProcessor):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._model_name: str = ""
+ self._settings: Dict[str, Any] = {}
@property
def model_name(self) -> str:
@@ -63,6 +64,16 @@ async def stop(self, frame: EndFrame):
async def cancel(self, frame: CancelFrame):
pass
+ async def _update_settings(self, settings: Dict[str, Any]):
+ for key, value in settings.items():
+ if key in self._settings:
+ logger.debug(f"Updating setting {key} to: [{value}] for {self.name}")
+ self._settings[key] = value
+ elif key == "model":
+ self.set_model_name(value)
+ else:
+ logger.warning(f"Unknown setting for {self.name} service: {key}")
+
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -169,6 +180,8 @@ def __init__(
self._push_stop_frames: bool = push_stop_frames
self._stop_frame_timeout_s: float = stop_frame_timeout_s
self._sample_rate: int = sample_rate
+ self._voice_id: str = ""
+ self._settings: Dict[str, Any] = {}
self._stop_frame_task: Optional[asyncio.Task] = None
self._stop_frame_queue: asyncio.Queue = asyncio.Queue()
@@ -184,52 +197,8 @@ async def set_model(self, model: str):
self.set_model_name(model)
@abstractmethod
- async def set_voice(self, voice: str):
- pass
-
- @abstractmethod
- async def set_language(self, language: Language):
- pass
-
- @abstractmethod
- async def set_speed(self, speed: Union[str, float]):
- pass
-
- @abstractmethod
- async def set_emotion(self, emotion: List[str]):
- pass
-
- @abstractmethod
- async def set_engine(self, engine: str):
- pass
-
- @abstractmethod
- async def set_pitch(self, pitch: str):
- pass
-
- @abstractmethod
- async def set_rate(self, rate: str):
- pass
-
- @abstractmethod
- async def set_volume(self, volume: str):
- pass
-
- @abstractmethod
- async def set_emphasis(self, emphasis: str):
- pass
-
- @abstractmethod
- async def set_style(self, style: str):
- pass
-
- @abstractmethod
- async def set_style_degree(self, style_degree: str):
- pass
-
- @abstractmethod
- async def set_role(self, role: str):
- pass
+ def set_voice(self, voice: str):
+ self._voice_id = voice
@abstractmethod
async def flush_audio(self):
@@ -259,6 +228,20 @@ async def cancel(self, frame: CancelFrame):
await self._stop_frame_task
self._stop_frame_task = None
+ async def _update_settings(self, settings: Dict[str, Any]):
+ for key, value in settings.items():
+ if key in self._settings:
+ logger.debug(f"Updating TTS setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "language":
+ self._settings[key] = Language(value)
+ elif key == "model":
+ self.set_model_name(value)
+ elif key == "voice":
+ self.set_voice(value)
+ else:
+ logger.warning(f"Unknown setting for TTS service: {key}")
+
async def say(self, text: str):
aggregate_sentences = self._aggregate_sentences
self._aggregate_sentences = False
@@ -286,7 +269,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
await self._push_tts_frames(frame.text)
await self.flush_audio()
elif isinstance(frame, TTSUpdateSettingsFrame):
- await self._update_tts_settings(frame)
+ await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -333,34 +316,6 @@ async def _push_tts_frames(self, text: str):
# interrupted, the text is not added to the assistant context.
await self.push_frame(TextFrame(text))
- async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame):
- if frame.model is not None:
- await self.set_model(frame.model)
- if frame.voice is not None:
- await self.set_voice(frame.voice)
- if frame.language is not None:
- await self.set_language(frame.language)
- if frame.speed is not None:
- await self.set_speed(frame.speed)
- if frame.emotion is not None:
- await self.set_emotion(frame.emotion)
- if frame.engine is not None:
- await self.set_engine(frame.engine)
- if frame.pitch is not None:
- await self.set_pitch(frame.pitch)
- if frame.rate is not None:
- await self.set_rate(frame.rate)
- if frame.volume is not None:
- await self.set_volume(frame.volume)
- if frame.emphasis is not None:
- await self.set_emphasis(frame.emphasis)
- if frame.style is not None:
- await self.set_style(frame.style)
- if frame.style_degree is not None:
- await self.set_style_degree(frame.style_degree)
- if frame.role is not None:
- await self.set_role(frame.role)
-
async def _stop_frame_handler(self):
try:
has_started = False
@@ -446,25 +401,29 @@ class STTService(AIService):
def __init__(self, **kwargs):
super().__init__(**kwargs)
+ self._settings: Dict[str, Any] = {}
@abstractmethod
async def set_model(self, model: str):
self.set_model_name(model)
- @abstractmethod
- async def set_language(self, language: Language):
- pass
-
@abstractmethod
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Returns transcript as a string"""
pass
- async def _update_stt_settings(self, frame: STTUpdateSettingsFrame):
- if frame.model is not None:
- await self.set_model(frame.model)
- if frame.language is not None:
- await self.set_language(frame.language)
+ async def _update_settings(self, settings: Dict[str, Any]):
+ logger.debug(f"Updating STT settings: {self._settings}")
+ for key, value in settings.items():
+ if key in self._settings:
+ logger.debug(f"Updating STT setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "language":
+ self._settings[key] = Language(value)
+ elif key == "model":
+ self.set_model_name(value)
+ else:
+ logger.warning(f"Unknown setting for STT service: {key}")
async def process_audio_frame(self, frame: AudioRawFrame):
await self.process_generator(self.run_stt(frame.audio))
@@ -478,7 +437,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
# push a TextFrame. We don't really want to push audio frames down.
await self.process_audio_frame(frame)
elif isinstance(frame, STTUpdateSettingsFrame):
- await self._update_stt_settings(frame)
+ await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
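A standalone sketch of the key routing the new `_update_settings` helpers implement
(a toy reimplementation for illustration, not the pipecat code itself): declared keys
are updated in place, "model" is routed to the model name, and unknown keys are
warned about and dropped.

```python
from typing import Any, Dict


def apply_settings(declared: Dict[str, Any], updates: Dict[str, Any]) -> Dict[str, Any]:
    """Toy version of the dispatch in AIService._update_settings above."""
    result = dict(declared)
    for key, value in updates.items():
        if key in result:  # only keys the service declared are applied
            result[key] = value
        elif key == "model":  # handled via set_model_name() in the real code
            result["model_name"] = value
        else:  # anything else is logged and ignored
            print(f"Unknown setting: {key}")
    return result


print(apply_settings({"temperature": 0.7}, {"temperature": 0.2, "foo": 1}))
# Unknown setting: foo
# {'temperature': 0.2}
```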
diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py
index 639a922e6..1b7064209 100644
--- a/src/pipecat/services/anthropic.py
+++ b/src/pipecat/services/anthropic.py
@@ -96,12 +96,14 @@ def __init__(
super().__init__(**kwargs)
self._client = AsyncAnthropic(api_key=api_key)
self.set_model_name(model)
- self._max_tokens = params.max_tokens
- self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False
- self._temperature = params.temperature
- self._top_k = params.top_k
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
+ self._settings = {
+ "max_tokens": params.max_tokens,
+ "enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
+ "temperature": params.temperature,
+ "top_k": params.top_k,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -120,30 +122,6 @@ def create_context_aggregator(
)
return AnthropicContextAggregatorPair(_user=user, _assistant=assistant)
- async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool):
- logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]")
- self._enable_prompt_caching_beta = enable_prompt_caching_beta
-
- async def set_max_tokens(self, max_tokens: int):
- logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
- self._max_tokens = max_tokens
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_k(self, top_k: float):
- logger.debug(f"Switching LLM top_k to: [{top_k}]")
- self._top_k = top_k
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def _process_context(self, context: OpenAILLMContext):
# Usage tracking. We track the usage reported by Anthropic in prompt_tokens and
# completion_tokens. We also estimate the completion tokens from output text
@@ -165,11 +143,11 @@ async def _process_context(self, context: OpenAILLMContext):
)
messages = context.messages
- if self._enable_prompt_caching_beta:
+ if self._settings["enable_prompt_caching_beta"]:
messages = context.get_messages_with_cache_control_markers()
api_call = self._client.messages.create
- if self._enable_prompt_caching_beta:
+ if self._settings["enable_prompt_caching_beta"]:
api_call = self._client.beta.prompt_caching.messages.create
await self.start_ttfb_metrics()
@@ -179,14 +157,14 @@ async def _process_context(self, context: OpenAILLMContext):
"system": context.system,
"messages": messages,
"model": self.model_name,
- "max_tokens": self._max_tokens,
+ "max_tokens": self._settings["max_tokens"],
"stream": True,
- "temperature": self._temperature,
- "top_k": self._top_k,
- "top_p": self._top_p,
+ "temperature": self._settings["temperature"],
+ "top_k": self._settings["top_k"],
+ "top_p": self._settings["top_p"],
}
- params.update(self._extra)
+ params.update(self._settings["extra"])
response = await api_call(**params)
@@ -284,21 +262,6 @@ async def _process_context(self, context: OpenAILLMContext):
cache_read_input_tokens=cache_read_input_tokens,
)
- async def _update_settings(self, frame: LLMUpdateSettingsFrame):
- if frame.model is not None:
- logger.debug(f"Switching LLM model to: [{frame.model}]")
- self.set_model_name(frame.model)
- if frame.max_tokens is not None:
- await self.set_max_tokens(frame.max_tokens)
- if frame.temperature is not None:
- await self.set_temperature(frame.temperature)
- if frame.top_k is not None:
- await self.set_top_k(frame.top_k)
- if frame.top_p is not None:
- await self.set_top_p(frame.top_p)
- if frame.extra:
- await self.set_extra(frame.extra)
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -314,10 +277,10 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
# to the context.
context = AnthropicLLMContext.from_image_frame(frame)
elif isinstance(frame, LLMUpdateSettingsFrame):
- await self._update_settings(frame)
+ await self._update_settings(frame.settings)
elif isinstance(frame, LLMEnablePromptCachingFrame):
logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
- self._enable_prompt_caching_beta = frame.enable
+ self._settings["enable_prompt_caching_beta"] = frame.enable
else:
await self.push_frame(frame, direction)
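A hedged construction sketch for the refactored service; the API key is a placeholder,
the model name is illustrative, and `InputParams` is assumed to expose the fields the
settings dict above is built from.

```python
from pipecat.services.anthropic import AnthropicLLMService

llm = AnthropicLLMService(
    api_key="YOUR_ANTHROPIC_API_KEY",  # placeholder
    model="claude-3-5-sonnet-20240620",  # illustrative model name
    params=AnthropicLLMService.InputParams(
        temperature=0.3,
        max_tokens=1024,
        # merged into the request via _settings["extra"], per the diff above
        extra={"metadata": {"user_id": "demo"}},
    ),
)
```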
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index 80240985f..210e0e700 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -6,6 +6,7 @@
from typing import AsyncGenerator, Optional
+from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -16,8 +17,7 @@
TTSStoppedFrame,
)
from pipecat.services.ai_services import TTSService
-
-from loguru import logger
+from pipecat.transcriptions.language import Language
try:
import boto3
@@ -30,10 +30,71 @@
raise Exception(f"Missing module: {e}")
+def language_to_aws_language(language: Language) -> str | None:
+ match language:
+ case Language.CA:
+ return "ca-ES"
+ case Language.ZH:
+ return "cmn-CN"
+ case Language.DA:
+ return "da-DK"
+ case Language.NL:
+ return "nl-NL"
+ case Language.NL_BE:
+ return "nl-BE"
+ case Language.EN:
+ return "en-US"
+ case Language.EN_US:
+ return "en-US"
+ case Language.EN_AU:
+ return "en-AU"
+ case Language.EN_GB:
+ return "en-GB"
+ case Language.EN_NZ:
+ return "en-NZ"
+ case Language.EN_IN:
+ return "en-IN"
+ case Language.FI:
+ return "fi-FI"
+ case Language.FR:
+ return "fr-FR"
+ case Language.FR_CA:
+ return "fr-CA"
+ case Language.DE:
+ return "de-DE"
+ case Language.HI:
+ return "hi-IN"
+ case Language.IT:
+ return "it-IT"
+ case Language.JA:
+ return "ja-JP"
+ case Language.KO:
+ return "ko-KR"
+ case Language.NO:
+ return "nb-NO"
+ case Language.PL:
+ return "pl-PL"
+ case Language.PT:
+ return "pt-PT"
+ case Language.PT_BR:
+ return "pt-BR"
+ case Language.RO:
+ return "ro-RO"
+ case Language.RU:
+ return "ru-RU"
+ case Language.ES:
+ return "es-ES"
+ case Language.SV:
+ return "sv-SE"
+ case Language.TR:
+ return "tr-TR"
+ return None
+
+
class AWSTTSService(TTSService):
class InputParams(BaseModel):
engine: Optional[str] = None
- language: Optional[str] = None
+ language: Optional[Language] = Language.EN
pitch: Optional[str] = None
rate: Optional[str] = None
volume: Optional[str] = None
@@ -57,9 +118,16 @@ def __init__(
aws_secret_access_key=api_key,
region_name=region,
)
- self._voice_id = voice_id
- self._sample_rate = sample_rate
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate,
+ "engine": params.engine,
+ "language": language_to_aws_language(params.language) if params.language else "en-US",
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "volume": params.volume,
+ }
+
+ self.set_voice(voice_id)
def can_generate_metrics(self) -> bool:
return True
@@ -67,18 +135,18 @@ def can_generate_metrics(self) -> bool:
def _construct_ssml(self, text: str) -> str:
ssml = ""
- if self._params.language:
- ssml += f""
+ if self._settings["language"]:
+ ssml += f""
prosody_attrs = []
# Prosody tags are only supported for standard and neural engines
- if self._params.engine != "generative":
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["engine"] != "generative":
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings["rate"]}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings["pitch"]}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings["volume"]}'")
if prosody_attrs:
ssml += f""
@@ -90,41 +158,13 @@ def _construct_ssml(self, text: str) -> str:
if prosody_attrs:
ssml += ""
- if self._params.language:
+ if self._settings["language"]:
ssml += ""
ssml += ""
return ssml
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_engine(self, engine: str):
- logger.debug(f"Switching TTS engine to: [{engine}]")
- self._params.engine = engine
-
- async def set_language(self, language: str):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str):
- logger.debug(f"Switching TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str):
- logger.debug(f"Switching TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_volume(self, volume: str):
- logger.debug(f"Switching TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_params(self, params: InputParams):
- logger.debug(f"Switching TTS params to: [{params}]")
- self._params = params
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -139,8 +179,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"TextType": "ssml",
"OutputFormat": "pcm",
"VoiceId": self._voice_id,
- "Engine": self._params.engine,
- "SampleRate": str(self._sample_rate),
+ "Engine": self._settings["engine"],
+ "SampleRate": str(self._settings["sample_rate"]),
}
# Filter out None values
@@ -160,7 +200,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
chunk = audio_data[i : i + chunk_size]
if len(chunk) > 0:
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
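A quick check of the new language mapping introduced above (assumes boto3/the AWS extra
is installed so the module imports cleanly):

```python
from pipecat.services.aws import language_to_aws_language
from pipecat.transcriptions.language import Language

print(language_to_aws_language(Language.FR_CA))  # fr-CA
print(language_to_aws_language(Language.NO))     # nb-NO
print(language_to_aws_language(Language.TH))     # None -> no lang tag is emitted
```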
diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
index a1349cefe..32af1b3f4 100644
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -4,12 +4,13 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import io
-
from typing import AsyncGenerator, Optional
+import aiohttp
+from loguru import logger
+from PIL import Image
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -26,12 +27,9 @@
)
from pipecat.services.ai_services import ImageGenService, STTService, TTSService
from pipecat.services.openai import BaseOpenAILLMService
+from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
-from PIL import Image
-
-from loguru import logger
-
# See .env.example for Azure configuration needed
try:
from azure.cognitiveservices.speech import (
@@ -73,10 +71,101 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
)
+def language_to_azure_language(language: Language) -> str | None:
+ match language:
+ case Language.BG:
+ return "bg-BG"
+ case Language.CA:
+ return "ca-ES"
+ case Language.ZH:
+ return "zh-CN"
+ case Language.ZH_TW:
+ return "zh-TW"
+ case Language.CS:
+ return "cs-CZ"
+ case Language.DA:
+ return "da-DK"
+ case Language.NL:
+ return "nl-NL"
+ case Language.EN:
+ return "en-US"
+ case Language.EN_US:
+ return "en-US"
+ case Language.EN_AU:
+ return "en-AU"
+ case Language.EN_GB:
+ return "en-GB"
+ case Language.EN_NZ:
+ return "en-NZ"
+ case Language.EN_IN:
+ return "en-IN"
+ case Language.ET:
+ return "et-EE"
+ case Language.FI:
+ return "fi-FI"
+ case Language.NL_BE:
+ return "nl-BE"
+ case Language.FR:
+ return "fr-FR"
+ case Language.FR_CA:
+ return "fr-CA"
+ case Language.DE:
+ return "de-DE"
+ case Language.DE_CH:
+ return "de-CH"
+ case Language.EL:
+ return "el-GR"
+ case Language.HI:
+ return "hi-IN"
+ case Language.HU:
+ return "hu-HU"
+ case Language.ID:
+ return "id-ID"
+ case Language.IT:
+ return "it-IT"
+ case Language.JA:
+ return "ja-JP"
+ case Language.KO:
+ return "ko-KR"
+ case Language.LV:
+ return "lv-LV"
+ case Language.LT:
+ return "lt-LT"
+ case Language.MS:
+ return "ms-MY"
+ case Language.NO:
+ return "nb-NO"
+ case Language.PL:
+ return "pl-PL"
+ case Language.PT:
+ return "pt-PT"
+ case Language.PT_BR:
+ return "pt-BR"
+ case Language.RO:
+ return "ro-RO"
+ case Language.RU:
+ return "ru-RU"
+ case Language.SK:
+ return "sk-SK"
+ case Language.ES:
+ return "es-ES"
+ case Language.SV:
+ return "sv-SE"
+ case Language.TH:
+ return "th-TH"
+ case Language.TR:
+ return "tr-TR"
+ case Language.UK:
+ return "uk-UA"
+ case Language.VI:
+ return "vi-VN"
+ return None
+
+
class AzureTTSService(TTSService):
class InputParams(BaseModel):
emphasis: Optional[str] = None
- language: Optional[str] = "en-US"
+ language: Optional[Language] = Language.EN
pitch: Optional[str] = None
rate: Optional[str] = "1.05"
role: Optional[str] = None
@@ -99,114 +188,67 @@ def __init__(
speech_config = SpeechConfig(subscription=api_key, region=region)
self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
- self._voice = voice
- self._sample_rate = sample_rate
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate,
+ "emphasis": params.emphasis,
+ "language": language_to_azure_language(params.language) if params.language else "en-US",
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "role": params.role,
+ "style": params.style,
+ "style_degree": params.style_degree,
+ "volume": params.volume,
+ }
+
+ self.set_voice(voice)
def can_generate_metrics(self) -> bool:
return True
def _construct_ssml(self, text: str) -> str:
ssml = (
- f""
- f""
+ f""
""
)
- if self._params.style:
- ssml += f""
prosody_attrs = []
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings['rate']}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings['volume']}'")
ssml += f""
- if self._params.emphasis:
- ssml += f""
+ if self._settings["emphasis"]:
+ ssml += f""
ssml += text
- if self._params.emphasis:
+ if self._settings["emphasis"]:
ssml += ""
ssml += ""
- if self._params.style:
+ if self._settings["style"]:
ssml += ""
ssml += ""
return ssml
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = voice
-
- async def set_emphasis(self, emphasis: str):
- logger.debug(f"Setting TTS emphasis to: [{emphasis}]")
- self._params.emphasis = emphasis
-
- async def set_language(self, language: str):
- logger.debug(f"Setting TTS language code to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str):
- logger.debug(f"Setting TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str):
- logger.debug(f"Setting TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_role(self, role: str):
- logger.debug(f"Setting TTS role to: [{role}]")
- self._params.role = role
-
- async def set_style(self, style: str):
- logger.debug(f"Setting TTS style to: [{style}]")
- self._params.style = style
-
- async def set_style_degree(self, style_degree: str):
- logger.debug(f"Setting TTS style degree to: [{style_degree}]")
- self._params.style_degree = style_degree
-
- async def set_volume(self, volume: str):
- logger.debug(f"Setting TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_params(self, **kwargs):
- valid_params = {
- "voice": self.set_voice,
- "emphasis": self.set_emphasis,
- "language_code": self.set_language,
- "pitch": self.set_pitch,
- "rate": self.set_rate,
- "role": self.set_role,
- "style": self.set_style,
- "style_degree": self.set_style_degree,
- "volume": self.set_volume,
- }
-
- for param, value in kwargs.items():
- if param in valid_params:
- await valid_params[param](value)
- else:
- logger.warning(f"Ignoring unknown parameter: {param}")
-
- logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}")
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -222,7 +264,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
await self.push_frame(TTSStartedFrame())
# Azure always sends a 44-byte header. Strip it off.
yield TTSAudioRawFrame(
- audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1
+ audio=result.audio_data[44:],
+ sample_rate=self._settings["sample_rate"],
+ num_channels=1,
)
await self.push_frame(TTSStoppedFrame())
elif result.reason == ResultReason.Canceled:
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 5f798b1e5..22e09cb2b 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -4,36 +4,35 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
+import asyncio
+import base64
import json
import uuid
-import base64
-import asyncio
+from typing import AsyncGenerator, List, Optional, Union
-from typing import AsyncGenerator, Optional, Union, List
+from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
CancelFrame,
+ EndFrame,
ErrorFrame,
Frame,
- StartInterruptionFrame,
+ LLMFullResponseEndFrame,
StartFrame,
- EndFrame,
+ StartInterruptionFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
- LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.ai_services import TTSService, WordTTSService
from pipecat.transcriptions.language import Language
-from pipecat.services.ai_services import WordTTSService, TTSService
-
-from loguru import logger
# See .env.example for Cartesia configuration needed
try:
- from cartesia import AsyncCartesia
import websockets
+ from cartesia import AsyncCartesia
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
@@ -66,7 +65,7 @@ class InputParams(BaseModel):
encoding: Optional[str] = "pcm_s16le"
sample_rate: Optional[int] = 16000
container: Optional[str] = "raw"
- language: Optional[str] = "en"
+ language: Optional[Language] = Language.EN
speed: Optional[Union[str, float]] = ""
emotion: Optional[List[str]] = []
@@ -77,7 +76,7 @@ def __init__(
voice_id: str,
cartesia_version: str = "2024-06-10",
url: str = "wss://api.cartesia.ai/tts/websocket",
- model_id: str = "sonic-english",
+ model: str = "sonic-english",
params: InputParams = InputParams(),
**kwargs,
):
@@ -101,17 +100,18 @@ def __init__(
self._api_key = api_key
self._cartesia_version = cartesia_version
self._url = url
- self._voice_id = voice_id
- self._model_id = model_id
- self.set_model_name(model_id)
- self._output_format = {
- "container": params.container,
- "encoding": params.encoding,
- "sample_rate": params.sample_rate,
+ self._settings = {
+ "output_format": {
+ "container": params.container,
+ "encoding": params.encoding,
+ "sample_rate": params.sample_rate,
+ },
+ "language": language_to_cartesia_language(params.language) if params.language else "en",
+ "speed": params.speed,
+ "emotion": params.emotion,
}
- self._language = params.language
- self._speed = params.speed
- self._emotion = params.emotion
+ self.set_model_name(model)
+ self.set_voice(voice_id)
self._websocket = None
self._context_id = None
@@ -125,42 +125,28 @@ async def set_model(self, model: str):
await super().set_model(model)
logger.debug(f"Switching TTS model to: [{model}]")
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_speed(self, speed: str):
- logger.debug(f"Switching TTS speed to: [{speed}]")
- self._speed = speed
-
- async def set_emotion(self, emotion: list[str]):
- logger.debug(f"Switching TTS emotion to: [{emotion}]")
- self._emotion = emotion
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._language = language_to_cartesia_language(language)
-
def _build_msg(
self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
):
- voice_config = {"mode": "id", "id": self._voice_id}
+ voice_config = {}
+ voice_config["mode"] = "id"
+ voice_config["id"] = self._voice_id
- if self._speed or self._emotion:
+ if self._settings["speed"] or self._settings["emotion"]:
voice_config["__experimental_controls"] = {}
- if self._speed:
- voice_config["__experimental_controls"]["speed"] = self._speed
- if self._emotion:
- voice_config["__experimental_controls"]["emotion"] = self._emotion
+ if self._settings["speed"]:
+ voice_config["__experimental_controls"]["speed"] = self._settings["speed"]
+ if self._settings["emotion"]:
+ voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"]
msg = {
"transcript": text,
"continue": continue_transcript,
"context_id": self._context_id,
- "model_id": self._model_name,
+ "model_id": self.model_name,
"voice": voice_config,
- "output_format": self._output_format,
- "language": self._language,
+ "output_format": self._settings["output_format"],
+ "language": self._settings["language"],
"add_timestamps": add_timestamps,
}
return json.dumps(msg)
@@ -245,7 +231,7 @@ async def _receive_task_handler(self):
self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["data"]),
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
@@ -294,7 +280,7 @@ class InputParams(BaseModel):
encoding: Optional[str] = "pcm_s16le"
sample_rate: Optional[int] = 16000
container: Optional[str] = "raw"
- language: Optional[str] = "en"
+ language: Optional[Language] = Language.EN
speed: Optional[Union[str, float]] = ""
emotion: Optional[List[str]] = []
@@ -303,7 +289,7 @@ def __init__(
*,
api_key: str,
voice_id: str,
- model_id: str = "sonic-english",
+ model: str = "sonic-english",
base_url: str = "https://api.cartesia.ai",
params: InputParams = InputParams(),
**kwargs,
@@ -311,17 +297,18 @@ def __init__(
super().__init__(**kwargs)
self._api_key = api_key
- self._voice_id = voice_id
- self._model_id = model_id
- self.set_model_name(model_id)
- self._output_format = {
- "container": params.container,
- "encoding": params.encoding,
- "sample_rate": params.sample_rate,
+ self._settings = {
+ "output_format": {
+ "container": params.container,
+ "encoding": params.encoding,
+ "sample_rate": params.sample_rate,
+ },
+ "language": language_to_cartesia_language(params.language) if params.language else None,
+ "speed": params.speed,
+ "emotion": params.emotion,
}
- self._language = params.language
- self._speed = params.speed
- self._emotion = params.emotion
+ self.set_voice(voice_id)
+ self.set_model_name(model)
self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
@@ -333,22 +320,6 @@ async def set_model(self, model: str):
self._model_id = model
await super().set_model(model)
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_speed(self, speed: str):
- logger.debug(f"Switching TTS speed to: [{speed}]")
- self._speed = speed
-
- async def set_emotion(self, emotion: list[str]):
- logger.debug(f"Switching TTS emotion to: [{emotion}]")
- self._emotion = emotion
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._language = language_to_cartesia_language(language)
-
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._client.close()
@@ -365,19 +336,19 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
try:
voice_controls = None
- if self._speed or self._emotion:
+ if self._settings["speed"] or self._settings["emotion"]:
voice_controls = {}
- if self._speed:
- voice_controls["speed"] = self._speed
- if self._emotion:
- voice_controls["emotion"] = self._emotion
+ if self._settings["speed"]:
+ voice_controls["speed"] = self._settings["speed"]
+ if self._settings["emotion"]:
+ voice_controls["emotion"] = self._settings["emotion"]
output = await self._client.tts.sse(
model_id=self._model_id,
transcript=text,
voice_id=self._voice_id,
- output_format=self._output_format,
- language=self._language,
+ output_format=self._settings["output_format"],
+ language=self._settings["language"],
stream=False,
_experimental_voice_controls=voice_controls,
)
@@ -386,7 +357,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
frame = TTSAudioRawFrame(
audio=output["audio"],
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
yield frame
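A hedged construction sketch reflecting the `model_id` -> `model` rename above; the API
key and voice id are placeholders.

```python
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transcriptions.language import Language

tts = CartesiaTTSService(
    api_key="YOUR_CARTESIA_API_KEY",  # placeholder
    voice_id="YOUR_CARTESIA_VOICE_ID",  # placeholder
    model="sonic-english",  # was `model_id=` before this change
    params=CartesiaTTSService.InputParams(language=Language.EN),
)
```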
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index d109cce3c..433e08172 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -5,9 +5,10 @@
#
import asyncio
-
from typing import AsyncGenerator
+from loguru import logger
+
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -24,8 +25,6 @@
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
-from loguru import logger
-
# See .env.example for Deepgram configuration needed
try:
from deepgram import (
@@ -57,25 +56,23 @@ def __init__(
):
super().__init__(**kwargs)
- self._voice = voice
- self._sample_rate = sample_rate
- self._encoding = encoding
+ self._settings = {
+ "sample_rate": sample_rate,
+ "encoding": encoding,
+ }
+ self.set_voice(voice)
self._deepgram_client = DeepgramClient(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
options = SpeakOptions(
- model=self._voice,
- encoding=self._encoding,
- sample_rate=self._sample_rate,
+ model=self._voice_id,
+ encoding=self._settings["encoding"],
+ sample_rate=self._settings["sample_rate"],
container="none",
)
@@ -103,7 +100,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
chunk = audio_buffer.read(chunk_size)
if not chunk:
break
- frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1)
+ frame = TTSAudioRawFrame(
+ audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1
+ )
yield frame
await self.push_frame(TTSStoppedFrame())
@@ -121,7 +120,7 @@ def __init__(
url: str = "",
live_options: LiveOptions = LiveOptions(
encoding="linear16",
- language="en-US",
+ language=Language.EN,
model="nova-2-conversationalai",
sample_rate=16000,
channels=1,
@@ -135,7 +134,7 @@ def __init__(
):
super().__init__(**kwargs)
- self._live_options = live_options
+ self._settings = vars(live_options)
self._client = DeepgramClient(
api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})
@@ -147,7 +146,7 @@ def __init__(
@property
def vad_enabled(self):
- return self._live_options.vad_events
+ return self._settings["vad_events"]
def can_generate_metrics(self) -> bool:
return self.vad_enabled
@@ -155,13 +154,7 @@ def can_generate_metrics(self) -> bool:
async def set_model(self, model: str):
await super().set_model(model)
logger.debug(f"Switching STT model to: [{model}]")
- self._live_options.model = model
- await self._disconnect()
- await self._connect()
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching STT language to: [{language}]")
- self._live_options.language = language
+ self._settings["model"] = model
await self._disconnect()
await self._connect()
@@ -182,7 +175,7 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
yield None
async def _connect(self):
- if await self._connection.start(self._live_options):
+ if await self._connection.start(self._settings):
logger.debug(f"{self}: Connected to Deepgram")
else:
logger.error(f"{self}: Unable to connect to Deepgram")
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
index 611f2a024..695988b03 100644
--- a/src/pipecat/services/elevenlabs.py
+++ b/src/pipecat/services/elevenlabs.py
@@ -24,6 +24,7 @@
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import WordTTSService
+from pipecat.transcriptions.language import Language
# See .env.example for ElevenLabs configuration needed
try:
@@ -49,6 +50,76 @@ def sample_rate_from_output_format(output_format: str) -> int:
return 16000
+def language_to_elevenlabs_language(language: Language) -> str | None:
+ match language:
+ case Language.BG:
+ return "bg"
+ case Language.ZH:
+ return "zh"
+ case Language.CS:
+ return "cs"
+ case Language.DA:
+ return "da"
+ case Language.NL:
+ return "nl"
+ case (
+ Language.EN
+ | Language.EN_US
+ | Language.EN_AU
+ | Language.EN_GB
+ | Language.EN_NZ
+ | Language.EN_IN
+ ):
+ return "en"
+ case Language.FI:
+ return "fi"
+ case Language.FR | Language.FR_CA:
+ return "fr"
+ case Language.DE | Language.DE_CH:
+ return "de"
+ case Language.EL:
+ return "el"
+ case Language.HI:
+ return "hi"
+ case Language.HU:
+ return "hu"
+ case Language.ID:
+ return "id"
+ case Language.IT:
+ return "it"
+ case Language.JA:
+ return "ja"
+ case Language.KO:
+ return "ko"
+ case Language.MS:
+ return "ms"
+ case Language.NO:
+ return "no"
+ case Language.PL:
+ return "pl"
+ case Language.PT:
+ return "pt-PT"
+ case Language.PT_BR:
+ return "pt-BR"
+ case Language.RO:
+ return "ro"
+ case Language.RU:
+ return "ru"
+ case Language.SK:
+ return "sk"
+ case Language.ES:
+ return "es"
+ case Language.SV:
+ return "sv"
+ case Language.TR:
+ return "tr"
+ case Language.UK:
+ return "uk"
+ case Language.VI:
+ return "vi"
+ return None
+
+
def calculate_word_times(
alignment_info: Mapping[str, Any], cumulative_time: float
) -> List[Tuple[str, float]]:
@@ -72,7 +143,7 @@ def calculate_word_times(
class ElevenLabsTTSService(WordTTSService):
class InputParams(BaseModel):
- language: Optional[str] = None
+ language: Optional[Language] = Language.EN
output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000"
optimize_streaming_latency: Optional[str] = None
stability: Optional[float] = None
@@ -124,10 +195,21 @@ def __init__(
)
self._api_key = api_key
- self._voice_id = voice_id
- self.set_model_name(model)
self._url = url
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate_from_output_format(params.output_format),
+ "language": language_to_elevenlabs_language(params.language)
+ if params.language
+ else "en",
+ "output_format": params.output_format,
+ "optimize_streaming_latency": params.optimize_streaming_latency,
+ "stability": params.stability,
+ "similarity_boost": params.similarity_boost,
+ "style": params.style,
+ "use_speaker_boost": params.use_speaker_boost,
+ }
+ self.set_model_name(model)
+ self.set_voice(voice_id)
self._voice_settings = self._set_voice_settings()
# Websocket connection to ElevenLabs.
@@ -142,19 +224,22 @@ def can_generate_metrics(self) -> bool:
def _set_voice_settings(self):
voice_settings = {}
- if self._params.stability is not None and self._params.similarity_boost is not None:
- voice_settings["stability"] = self._params.stability
- voice_settings["similarity_boost"] = self._params.similarity_boost
- if self._params.style is not None:
- voice_settings["style"] = self._params.style
- if self._params.use_speaker_boost is not None:
- voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
+ if (
+ self._settings["stability"] is not None
+ and self._settings["similarity_boost"] is not None
+ ):
+ voice_settings["stability"] = self._settings["stability"]
+ voice_settings["similarity_boost"] = self._settings["similarity_boost"]
+ if self._settings["style"] is not None:
+ voice_settings["style"] = self._settings["style"]
+ if self._settings["use_speaker_boost"] is not None:
+ voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
else:
- if self._params.style is not None:
+ if self._settings["style"] is not None:
logger.warning(
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
- if self._params.use_speaker_boost is not None:
+ if self._settings["use_speaker_boost"] is not None:
logger.warning(
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
@@ -167,33 +252,13 @@ async def set_model(self, model: str):
await self._disconnect()
await self._connect()
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
- await self._disconnect()
- await self._connect()
-
- async def set_voice_settings(
- self,
- stability: Optional[float] = None,
- similarity_boost: Optional[float] = None,
- style: Optional[float] = None,
- use_speaker_boost: Optional[bool] = None,
- ):
- self._params.stability = stability if stability is not None else self._params.stability
- self._params.similarity_boost = (
- similarity_boost if similarity_boost is not None else self._params.similarity_boost
- )
- self._params.style = style if style is not None else self._params.style
- self._params.use_speaker_boost = (
- use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
- )
-
- self._set_voice_settings()
-
- if self._websocket:
- msg = {"voice_settings": self._voice_settings}
- await self._websocket.send(json.dumps(msg))
+ async def _update_settings(self, settings: Dict[str, Any]):
+ prev_voice = self._voice_id
+ await super()._update_settings(settings)
+ if not prev_voice == self._voice_id:
+ await self._disconnect()
+ await self._connect()
+ logger.debug(f"Switching TTS voice to: [{self._voice_id}]")
async def start(self, frame: StartFrame):
await super().start(frame)
@@ -223,19 +288,19 @@ async def _connect(self):
try:
voice_id = self._voice_id
model = self.model_name
- output_format = self._params.output_format
+ output_format = self._settings["output_format"]
url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
- if self._params.optimize_streaming_latency:
- url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
+ if self._settings["optimize_streaming_latency"]:
+ url += f"&optimize_streaming_latency={self._settings["optimize_streaming_latency"]}"
# language can only be used with the 'eleven_turbo_v2_5' model
- if self._params.language:
+ if self._settings["language"]:
if model == "eleven_turbo_v2_5":
- url += f"&language_code={self._params.language}"
+ url += f"&language_code={self._settings["language"]}"
else:
logger.debug(
- f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
+ f"Language code [{self._settings["language"]}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
)
self._websocket = await websockets.connect(url)
@@ -286,7 +351,7 @@ async def _receive_task_handler(self):
self.start_word_timestamps()
audio = base64.b64decode(msg["audio"])
- frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
await self.push_frame(frame)
if msg.get("alignment"):
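A hedged construction sketch; the key and voice id are placeholders, and the constructor
parameter names are assumed from the diff. As noted above, the language code is only
applied with the 'eleven_turbo_v2_5' model.

```python
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transcriptions.language import Language

tts = ElevenLabsTTSService(
    api_key="YOUR_ELEVENLABS_API_KEY",  # placeholder
    voice_id="YOUR_ELEVENLABS_VOICE_ID",  # placeholder
    model="eleven_turbo_v2_5",
    params=ElevenLabsTTSService.InputParams(language=Language.PT_BR),
)
```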
diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py
index a590d73cf..0938a1cee 100644
--- a/src/pipecat/services/gladia.py
+++ b/src/pipecat/services/gladia.py
@@ -6,8 +6,9 @@
import base64
import json
-
from typing import AsyncGenerator, Optional
+
+from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
@@ -19,10 +20,9 @@
TranscriptionFrame,
)
from pipecat.services.ai_services import STTService
+from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
-from loguru import logger
-
# See .env.example for Gladia configuration needed
try:
import websockets
@@ -34,10 +34,88 @@
raise Exception(f"Missing module: {e}")
+def language_to_gladia_language(language: Language) -> str | None:
+ match language:
+ case Language.BG:
+ return "bg"
+ case Language.CA:
+ return "ca"
+ case Language.ZH:
+ return "zh"
+ case Language.CS:
+ return "cs"
+ case Language.DA:
+ return "da"
+ case Language.NL:
+ return "nl"
+ case (
+ Language.EN
+ | Language.EN_US
+ | Language.EN_AU
+ | Language.EN_GB
+ | Language.EN_NZ
+ | Language.EN_IN
+ ):
+ return "en"
+ case Language.ET:
+ return "et"
+ case Language.FI:
+ return "fi"
+ case Language.FR | Language.FR_CA:
+ return "fr"
+ case Language.DE | Language.DE_CH:
+ return "de"
+ case Language.EL:
+ return "el"
+ case Language.HI:
+ return "hi"
+ case Language.HU:
+ return "hu"
+ case Language.ID:
+ return "id"
+ case Language.IT:
+ return "it"
+ case Language.JA:
+ return "ja"
+ case Language.KO:
+ return "ko"
+ case Language.LV:
+ return "lv"
+ case Language.LT:
+ return "lt"
+ case Language.MS:
+ return "ms"
+ case Language.NO:
+ return "no"
+ case Language.PL:
+ return "pl"
+ case Language.PT | Language.PT_BR:
+ return "pt"
+ case Language.RO:
+ return "ro"
+ case Language.RU:
+ return "ru"
+ case Language.SK:
+ return "sk"
+ case Language.ES:
+ return "es"
+ case Language.SV:
+ return "sv"
+ case Language.TH:
+ return "th"
+ case Language.TR:
+ return "tr"
+ case Language.UK:
+ return "uk"
+ case Language.VI:
+ return "vi"
+ return None
+
+
class GladiaSTTService(STTService):
class InputParams(BaseModel):
sample_rate: Optional[int] = 16000
- language: Optional[str] = "english"
+ language: Optional[Language] = Language.EN
transcription_hint: Optional[str] = None
endpointing: Optional[int] = 200
prosody: Optional[bool] = None
@@ -55,7 +133,13 @@ def __init__(
self._api_key = api_key
self._url = url
- self._params = params
+ self._settings = {
+ "sample_rate": params.sample_rate,
+ "language": language_to_gladia_language(params.language) if params.language else "en",
+ "transcription_hint": params.transcription_hint,
+ "endpointing": params.endpointing,
+ "prosody": params.prosody,
+ }
self._confidence = confidence
async def start(self, frame: StartFrame):
@@ -84,7 +168,11 @@ async def _setup_gladia(self):
"encoding": "WAV/PCM",
"model_type": "fast",
"language_behaviour": "manual",
- **self._params.model_dump(exclude_none=True),
+ "sample_rate": self._settings["sample_rate"],
+ "language": self._settings["language"],
+ "transcription_hint": self._settings["transcription_hint"],
+ "endpointing": self._settings["endpointing"],
+ "prosody": self._settings["prosody"],
}
await self._websocket.send(json.dumps(configuration))
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index 519f47028..55a2576d2 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -30,6 +30,7 @@
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import LLMService, TTSService
+from pipecat.transcriptions.language import Language
try:
import google.ai.generativelanguage as glm
@@ -137,9 +138,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
elif isinstance(frame, LLMUpdateSettingsFrame):
- if frame.model is not None:
- logger.debug(f"Switching LLM model to: [{frame.model}]")
- self.set_model_name(frame.model)
+ await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -147,13 +146,100 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
await self._process_context(context)
+def language_to_google_language(language: Language) -> str | None:
+ match language:
+ case Language.BG:
+ return "bg-BG"
+ case Language.CA:
+ return "ca-ES"
+ case Language.ZH:
+ return "cmn-CN"
+ case Language.ZH_TW:
+ return "cmn-TW"
+ case Language.CS:
+ return "cs-CZ"
+ case Language.DA:
+ return "da-DK"
+ case Language.NL:
+ return "nl-NL"
+ case Language.EN:
+ return "en-US"
+ case Language.EN_US:
+ return "en-US"
+ case Language.EN_AU:
+ return "en-AU"
+ case Language.EN_GB:
+ return "en-GB"
+ case Language.EN_IN:
+ return "en-IN"
+ case Language.ET:
+ return "et-EE"
+ case Language.FI:
+ return "fi-FI"
+ case Language.NL_BE:
+ return "nl-BE"
+ case Language.FR:
+ return "fr-FR"
+ case Language.FR_CA:
+ return "fr-CA"
+ case Language.DE:
+ return "de-DE"
+ case Language.EL:
+ return "el-GR"
+ case Language.HI:
+ return "hi-IN"
+ case Language.HU:
+ return "hu-HU"
+ case Language.ID:
+ return "id-ID"
+ case Language.IT:
+ return "it-IT"
+ case Language.JA:
+ return "ja-JP"
+ case Language.KO:
+ return "ko-KR"
+ case Language.LV:
+ return "lv-LV"
+ case Language.LT:
+ return "lt-LT"
+ case Language.MS:
+ return "ms-MY"
+ case Language.NO:
+ return "nb-NO"
+ case Language.PL:
+ return "pl-PL"
+ case Language.PT:
+ return "pt-PT"
+ case Language.PT_BR:
+ return "pt-BR"
+ case Language.RO:
+ return "ro-RO"
+ case Language.RU:
+ return "ru-RU"
+ case Language.SK:
+ return "sk-SK"
+ case Language.ES:
+ return "es-ES"
+ case Language.SV:
+ return "sv-SE"
+ case Language.TH:
+ return "th-TH"
+ case Language.TR:
+ return "tr-TR"
+ case Language.UK:
+ return "uk-UA"
+ case Language.VI:
+ return "vi-VN"
+ return None
+
+
class GoogleTTSService(TTSService):
class InputParams(BaseModel):
pitch: Optional[str] = None
rate: Optional[str] = None
volume: Optional[str] = None
emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None
- language: Optional[str] = "en-US"
+ language: Optional[Language] = Language.EN
gender: Optional[Literal["male", "female", "neutral"]] = None
google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None
@@ -169,8 +255,19 @@ def __init__(
):
super().__init__(sample_rate=sample_rate, **kwargs)
- self._voice_id: str = voice_id
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "volume": params.volume,
+ "emphasis": params.emphasis,
+ "language": language_to_google_language(params.language)
+ if params.language
+ else "en-US",
+ "gender": params.gender,
+ "google_style": params.google_style,
+ }
+ self.set_voice(voice_id)
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
credentials, credentials_path
)
@@ -203,38 +300,38 @@ def _construct_ssml(self, text: str) -> str:
# Voice tag
voice_attrs = [f"name='{self._voice_id}'"]
- if self._params.language:
- voice_attrs.append(f"language='{self._params.language}'")
- if self._params.gender:
- voice_attrs.append(f"gender='{self._params.gender}'")
+ if self._settings["language"]:
+ voice_attrs.append(f"language='{self._settings['language']}'")
+ if self._settings["gender"]:
+ voice_attrs.append(f"gender='{self._settings['gender']}'")
ssml += f""
# Prosody tag
prosody_attrs = []
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings['rate']}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings['volume']}'")
if prosody_attrs:
ssml += f""
# Emphasis tag
- if self._params.emphasis:
- ssml += f""
+ if self._settings["emphasis"]:
+ ssml += f""
# Google style tag
- if self._params.google_style:
- ssml += f""
+ if self._settings["google_style"]:
+ ssml += f""
ssml += text
# Close tags
- if self._params.google_style:
+ if self._settings["google_style"]:
ssml += ""
- if self._params.emphasis:
+ if self._settings["emphasis"]:
ssml += ""
if prosody_attrs:
ssml += ""
@@ -242,46 +339,6 @@ def _construct_ssml(self, text: str) -> str:
return ssml
- async def set_voice(self, voice: str) -> None:
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_language(self, language: str) -> None:
- logger.debug(f"Switching TTS language to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str) -> None:
- logger.debug(f"Switching TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str) -> None:
- logger.debug(f"Switching TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_volume(self, volume: str) -> None:
- logger.debug(f"Switching TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_emphasis(
- self, emphasis: Literal["strong", "moderate", "reduced", "none"]
- ) -> None:
- logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
- self._params.emphasis = emphasis
-
- async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None:
- logger.debug(f"Switch TTS gender to [{gender}]")
- self._params.gender = gender
-
- async def google_style(
- self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"]
- ) -> None:
- logger.debug(f"Switching TTS google style to: [{google_style}]")
- self._params.google_style = google_style
-
- async def set_params(self, params: InputParams) -> None:
- logger.debug(f"Switching TTS params to: [{params}]")
- self._params = params
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -291,11 +348,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
ssml = self._construct_ssml(text)
synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml)
voice = texttospeech_v1.VoiceSelectionParams(
- language_code=self._params.language, name=self._voice_id
+ language_code=self._settings["language"], name=self._voice_id
)
audio_config = texttospeech_v1.AudioConfig(
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
- sample_rate_hertz=self.sample_rate,
+ sample_rate_hertz=self._settings["sample_rate"],
)
request = texttospeech_v1.SynthesizeSpeechRequest(
@@ -318,7 +375,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
if not chunk:
break
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await asyncio.sleep(0) # Allow other tasks to run
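
For reference, a minimal usage sketch (not part of the patch) showing how the `Language` enum flows through `GoogleTTSService.InputParams` after this change. It assumes keyword arguments `credentials_path`, `voice_id`, and `params`, as referenced in the constructor body above; the voice id and credentials path are placeholders.

```python
from pipecat.services.google import GoogleTTSService
from pipecat.transcriptions.language import Language

tts = GoogleTTSService(
    credentials_path="google-credentials.json",  # placeholder path
    voice_id="en-US-Neural2-J",                  # placeholder voice id
    params=GoogleTTSService.InputParams(
        language=Language.SV,  # mapped to "sv-SE" by language_to_google_language()
        rate="1.1",
        gender="female",
    ),
)
```
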
diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py
index 8f18002c5..1ea3f1e62 100644
--- a/src/pipecat/services/lmnt.py
+++ b/src/pipecat/services/lmnt.py
@@ -5,10 +5,10 @@
#
import asyncio
-
from typing import AsyncGenerator
-from pipecat.processors.frame_processor import FrameDirection
+from loguru import logger
+
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -20,9 +20,9 @@
TTSStartedFrame,
TTSStoppedFrame,
)
+from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import TTSService
-
-from loguru import logger
+from pipecat.transcriptions.language import Language
# See .env.example for LMNT configuration needed
try:
@@ -35,6 +35,32 @@
raise Exception(f"Missing module: {e}")
+def language_to_lmnt_language(language: Language) -> str | None:
+ match language:
+ case Language.DE:
+ return "de"
+ case (
+ Language.EN
+ | Language.EN_US
+ | Language.EN_AU
+ | Language.EN_GB
+ | Language.EN_NZ
+ | Language.EN_IN
+ ):
+ return "en"
+ case Language.ES:
+ return "es"
+ case Language.FR | Language.FR_CA:
+ return "fr"
+ case Language.PT | Language.PT_BR:
+ return "pt"
+ case Language.ZH | Language.ZH_TW:
+ return "zh"
+ case Language.KO:
+ return "ko"
+ return None
+
+
class LmntTTSService(TTSService):
def __init__(
self,
@@ -42,7 +68,7 @@ def __init__(
api_key: str,
voice_id: str,
sample_rate: int = 24000,
- language: str = "en",
+ language: Language = Language.EN,
**kwargs,
):
# Let TTSService produce TTSStoppedFrames after a short delay of
@@ -50,13 +76,16 @@ def __init__(
super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs)
self._api_key = api_key
- self._voice_id = voice_id
- self._output_format = {
- "container": "raw",
- "encoding": "pcm_s16le",
- "sample_rate": sample_rate,
+ self._settings = {
+ "output_format": {
+ "container": "raw",
+ "encoding": "pcm_s16le",
+ "sample_rate": sample_rate,
+ },
+ "language": language_to_lmnt_language(language) if language else "en",
}
- self._language = language
+
+ self.set_voice(voice_id)
self._speech = None
self._connection = None
@@ -68,10 +97,6 @@ def __init__(
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
async def start(self, frame: StartFrame):
await super().start(frame)
await self._connect()
@@ -93,7 +118,9 @@ async def _connect(self):
try:
self._speech = Speech()
self._connection = await self._speech.synthesize_streaming(
- self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]
+ self._voice_id,
+ format="raw",
+ sample_rate=self._settings["output_format"]["sample_rate"],
)
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
except Exception as e:
@@ -130,7 +157,7 @@ async def _receive_task_handler(self):
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(
audio=msg["audio"],
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
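
A similar sketch (not part of the patch) for the LMNT service, using the constructor signature shown above; the API key and voice id are placeholders.

```python
from pipecat.services.lmnt import LmntTTSService
from pipecat.transcriptions.language import Language

tts = LmntTTSService(
    api_key="LMNT_API_KEY",   # placeholder key
    voice_id="morgan",        # placeholder voice id
    language=Language.FR_CA,  # mapped to "fr" by language_to_lmnt_language()
)
```
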
diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py
index 9a7cc9023..24d673234 100644
--- a/src/pipecat/services/openai.py
+++ b/src/pipecat/services/openai.py
@@ -111,14 +111,16 @@ def __init__(
**kwargs,
):
super().__init__(**kwargs)
+ self._settings = {
+ "frequency_penalty": params.frequency_penalty,
+ "presence_penalty": params.presence_penalty,
+ "seed": params.seed,
+ "temperature": params.temperature,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
self.set_model_name(model)
self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
- self._frequency_penalty = params.frequency_penalty
- self._presence_penalty = params.presence_penalty
- self._seed = params.seed
- self._temperature = params.temperature
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
def create_client(self, api_key=None, base_url=None, **kwargs):
return AsyncOpenAI(
@@ -134,30 +136,6 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
def can_generate_metrics(self) -> bool:
return True
- async def set_frequency_penalty(self, frequency_penalty: float):
- logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
- self._frequency_penalty = frequency_penalty
-
- async def set_presence_penalty(self, presence_penalty: float):
- logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
- self._presence_penalty = presence_penalty
-
- async def set_seed(self, seed: int):
- logger.debug(f"Switching LLM seed to: [{seed}]")
- self._seed = seed
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> AsyncStream[ChatCompletionChunk]:
@@ -168,14 +146,14 @@ async def get_chat_completions(
"tools": context.tools,
"tool_choice": context.tool_choice,
"stream_options": {"include_usage": True},
- "frequency_penalty": self._frequency_penalty,
- "presence_penalty": self._presence_penalty,
- "seed": self._seed,
- "temperature": self._temperature,
- "top_p": self._top_p,
+ "frequency_penalty": self._settings["frequency_penalty"],
+ "presence_penalty": self._settings["presence_penalty"],
+ "seed": self._settings["seed"],
+ "temperature": self._settings["temperature"],
+ "top_p": self._settings["top_p"],
}
- params.update(self._extra)
+ params.update(self._settings["extra"])
chunks = await self._client.chat.completions.create(**params)
return chunks
@@ -295,23 +273,6 @@ async def _process_context(self, context: OpenAILLMContext):
f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
)
- async def _update_settings(self, frame: LLMUpdateSettingsFrame):
- if frame.model is not None:
- logger.debug(f"Switching LLM model to: [{frame.model}]")
- self.set_model_name(frame.model)
- if frame.frequency_penalty is not None:
- await self.set_frequency_penalty(frame.frequency_penalty)
- if frame.presence_penalty is not None:
- await self.set_presence_penalty(frame.presence_penalty)
- if frame.seed is not None:
- await self.set_seed(frame.seed)
- if frame.temperature is not None:
- await self.set_temperature(frame.temperature)
- if frame.top_p is not None:
- await self.set_top_p(frame.top_p)
- if frame.extra:
- await self.set_extra(frame.extra)
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -323,7 +284,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
elif isinstance(frame, LLMUpdateSettingsFrame):
- await self._update_settings(frame)
+ await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -427,22 +388,20 @@ def __init__(
):
super().__init__(sample_rate=sample_rate, **kwargs)
- self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
+ self._settings = {
+ "sample_rate": sample_rate,
+ }
self.set_model_name(model)
- self._sample_rate = sample_rate
+ self.set_voice(voice)
self._client = AsyncOpenAI(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = VALID_VOICES.get(voice, self._voice)
-
async def set_model(self, model: str):
logger.debug(f"Switching TTS model to: [{model}]")
- self._model = model
+ self.set_model_name(model)
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -452,7 +411,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
async with self._client.audio.speech.with_streaming_response.create(
input=text,
model=self.model_name,
- voice=self._voice,
+ voice=VALID_VOICES[self._voice_id],
response_format="pcm",
) as r:
if r.status_code != 200:
@@ -471,7 +430,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
async for chunk in r.iter_bytes(8192):
if len(chunk) > 0:
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except BadRequestError as e:
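
With the per-setting setters removed, runtime changes go through a single settings frame. The sketch below (not part of the patch) assumes `LLMUpdateSettingsFrame` now carries a plain `settings` mapping, as the new `process_frame` handling above suggests, and that `task` is a running `PipelineTask` from the surrounding application.

```python
from pipecat.frames.frames import LLMUpdateSettingsFrame

async def lower_temperature(task):
    # `task` is assumed to be a running PipelineTask
    await task.queue_frame(
        LLMUpdateSettingsFrame(settings={"temperature": 0.4, "top_p": 0.9})
    )
```
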
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
index 2ffa3a419..fe5feba3a 100644
--- a/src/pipecat/services/playht.py
+++ b/src/pipecat/services/playht.py
@@ -6,17 +6,21 @@
import io
import struct
-
from typing import AsyncGenerator
-from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame
-from pipecat.services.ai_services import TTSService
-
from loguru import logger
+from pipecat.frames.frames import (
+ Frame,
+ TTSAudioRawFrame,
+ TTSStartedFrame,
+ TTSStoppedFrame,
+)
+from pipecat.services.ai_services import TTSService
+
try:
- from pyht.client import TTSOptions
from pyht.async_client import AsyncClient
+ from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
@@ -39,17 +43,23 @@ def __init__(
user_id=self._user_id,
api_key=self._speech_key,
)
+ self._settings = {
+ "sample_rate": sample_rate,
+ "quality": "higher",
+ "format": Format.FORMAT_WAV,
+ "voice_engine": "PlayHT2.0-turbo",
+ }
+ self.set_voice(voice_url)
self._options = TTSOptions(
- voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV
+ voice=self._voice_id,
+ sample_rate=self._settings["sample_rate"],
+ quality=self._settings["quality"],
+ format=self._settings["format"],
)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._options.voice = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -60,7 +70,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
await self.start_ttfb_metrics()
playht_gen = self._client.tts(
- text, voice_engine="PlayHT2.0-turbo", options=self._options
+ text, voice_engine=self._settings["voice_engine"], options=self._options
)
await self.start_tts_usage_metrics(text)
@@ -83,7 +93,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
else:
if len(chunk):
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, 16000, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except Exception as e:
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 5da470002..1bf74e508 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -5,16 +5,13 @@
#
from typing import Any, Dict, Optional
+
import httpx
from loguru import logger
from pydantic import BaseModel, Field
-from pipecat.frames.frames import (
- LLMUpdateSettingsFrame,
-)
from pipecat.services.openai import OpenAILLMService
-
try:
# Together.ai is recommending OpenAI-compatible function calling, so we've switched over
# to using the OpenAI client library here rather than the Together Python client library.
@@ -53,13 +50,15 @@ def __init__(
):
super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs)
self.set_model_name(model)
- self._max_tokens = params.max_tokens
- self._frequency_penalty = params.frequency_penalty
- self._presence_penalty = params.presence_penalty
- self._temperature = params.temperature
- self._top_k = params.top_k
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
+ self._settings = {
+ "max_tokens": params.max_tokens,
+ "frequency_penalty": params.frequency_penalty,
+ "presence_penalty": params.presence_penalty,
+ "seed": params.seed,
+ "temperature": params.temperature,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -75,50 +74,3 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
)
),
)
-
- async def set_frequency_penalty(self, frequency_penalty: float):
- logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
- self._frequency_penalty = frequency_penalty
-
- async def set_max_tokens(self, max_tokens: int):
- logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
- self._max_tokens = max_tokens
-
- async def set_presence_penalty(self, presence_penalty: float):
- logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
- self._presence_penalty = presence_penalty
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_k(self, top_k: float):
- logger.debug(f"Switching LLM top_k to: [{top_k}]")
- self._top_k = top_k
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
- async def _update_settings(self, frame: LLMUpdateSettingsFrame):
- if frame.model is not None:
- logger.debug(f"Switching LLM model to: [{frame.model}]")
- self.set_model_name(frame.model)
- if frame.frequency_penalty is not None:
- await self.set_frequency_penalty(frame.frequency_penalty)
- if frame.max_tokens is not None:
- await self.set_max_tokens(frame.max_tokens)
- if frame.presence_penalty is not None:
- await self.set_presence_penalty(frame.presence_penalty)
- if frame.temperature is not None:
- await self.set_temperature(frame.temperature)
- if frame.top_k is not None:
- await self.set_top_k(frame.top_k)
- if frame.top_p is not None:
- await self.set_top_p(frame.top_p)
- if frame.extra:
- await self.set_extra(frame.extra)
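
After this change, `TogetherLLMService` keeps its sampling options in the shared `_settings` dict and inherits settings updates from `OpenAILLMService` instead of overriding `_update_settings`. A construction sketch (not part of the patch), assuming a nested `InputParams` model with the fields referenced above; the API key and model name are placeholders.

```python
from pipecat.services.together import TogetherLLMService

llm = TogetherLLMService(
    api_key="TOGETHER_API_KEY",                            # placeholder key
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",   # placeholder model
    params=TogetherLLMService.InputParams(temperature=0.7, max_tokens=1024),
)
```
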
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 2c47d59e8..eb20d5f3c 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -4,10 +4,12 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
-
from typing import Any, AsyncGenerator, Dict
+import aiohttp
+import numpy as np
+from loguru import logger
+
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -17,10 +19,7 @@
TTSStoppedFrame,
)
from pipecat.services.ai_services import TTSService
-
-import numpy as np
-
-from loguru import logger
+from pipecat.transcriptions.language import Language
try:
import resampy
@@ -38,21 +37,67 @@
# https://github.com/coqui-ai/xtts-streaming-server
+def language_to_xtts_language(language: Language) -> str | None:
+ match language:
+ case Language.CS:
+ return "cs"
+ case Language.DE:
+ return "de"
+ case (
+ Language.EN
+ | Language.EN_US
+ | Language.EN_AU
+ | Language.EN_GB
+ | Language.EN_NZ
+ | Language.EN_IN
+ ):
+ return "en"
+ case Language.ES:
+ return "es"
+ case Language.FR:
+ return "fr"
+ case Language.HI:
+ return "hi"
+ case Language.HU:
+ return "hu"
+ case Language.IT:
+ return "it"
+ case Language.JA:
+ return "ja"
+ case Language.KO:
+ return "ko"
+ case Language.NL:
+ return "nl"
+ case Language.PL:
+ return "pl"
+ case Language.PT | Language.PT_BR:
+ return "pt"
+ case Language.RU:
+ return "ru"
+ case Language.TR:
+ return "tr"
+ case Language.ZH:
+ return "zh-cn"
+ return None
+
+
class XTTSService(TTSService):
def __init__(
self,
*,
voice_id: str,
- language: str,
+ language: Language,
base_url: str,
aiohttp_session: aiohttp.ClientSession,
**kwargs,
):
super().__init__(**kwargs)
- self._voice_id = voice_id
- self._language = language
- self._base_url = base_url
+ self._settings = {
+ "language": language_to_xtts_language(language) if language else "en",
+ "base_url": base_url,
+ }
+ self.set_voice(voice_id)
self._studio_speakers: Dict[str, Any] | None = None
self._aiohttp_session = aiohttp_session
@@ -61,7 +106,7 @@ def can_generate_metrics(self) -> bool:
async def start(self, frame: StartFrame):
await super().start(frame)
- async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r:
+ async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r:
if r.status != 200:
text = await r.text()
logger.error(
@@ -75,10 +120,6 @@ async def start(self, frame: StartFrame):
return
self._studio_speakers = await r.json()
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -88,11 +129,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
embeddings = self._studio_speakers[self._voice_id]
- url = self._base_url + "/tts_stream"
+ url = self._settings["base_url"] + "/tts_stream"
payload = {
"text": text.replace(".", "").replace("*", ""),
- "language": self._language,
+ "language": self._settings["language"],
"speaker_embedding": embeddings["speaker_embedding"],
"gpt_cond_latent": embeddings["gpt_cond_latent"],
"add_wav_header": False,