diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index 7cb93606d..e2ef78df5 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -530,10 +530,24 @@ def __str__(self):
class ServiceUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update service settings."""
- service_type: str
settings: Dict[str, Any]
+@dataclass
+class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+    """A control frame containing a request to update LLM settings."""
+
+
+@dataclass
+class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+    """A control frame containing a request to update TTS settings."""
+
+
+@dataclass
+class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+    """A control frame containing a request to update STT settings."""
+
+
@dataclass
class FunctionCallInProgressFrame(SystemFrame):
"""A frame signaling that a function call is in progress."""
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index 6ddc4d426..dd2b2cfd8 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -8,7 +8,7 @@
import io
import wave
from abc import abstractmethod
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from loguru import logger
@@ -19,14 +19,15 @@
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
- ServiceUpdateSettingsFrame,
StartFrame,
StartInterruptionFrame,
+ STTUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSSpeakFrame,
TTSStartedFrame,
TTSStoppedFrame,
+ TTSUpdateSettingsFrame,
UserImageRequestFrame,
VisionImageRawFrame,
)
@@ -183,62 +184,6 @@ def sample_rate(self) -> int:
async def set_model(self, model: str):
self.set_model_name(model)
- @abstractmethod
- async def set_voice(self, voice: str):
- pass
-
- @abstractmethod
- async def set_language(self, language: Language):
- pass
-
- @abstractmethod
- async def set_speed(self, speed: Union[str, float]):
- pass
-
- @abstractmethod
- async def set_emotion(self, emotion: List[str]):
- pass
-
- @abstractmethod
- async def set_engine(self, engine: str):
- pass
-
- @abstractmethod
- async def set_pitch(self, pitch: str):
- pass
-
- @abstractmethod
- async def set_rate(self, rate: str):
- pass
-
- @abstractmethod
- async def set_volume(self, volume: str):
- pass
-
- @abstractmethod
- async def set_emphasis(self, emphasis: str):
- pass
-
- @abstractmethod
- async def set_style(self, style: str):
- pass
-
- @abstractmethod
- async def set_style_degree(self, style_degree: str):
- pass
-
- @abstractmethod
- async def set_role(self, role: str):
- pass
-
- @abstractmethod
- async def set_gender(self, gender: str):
- pass
-
- @abstractmethod
- async def set_google_style(self, google_style: str):
- pass
-
@abstractmethod
async def flush_audio(self):
pass
@@ -269,20 +214,16 @@ async def cancel(self, frame: CancelFrame):
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- if key == "language":
- await setter(Language(value))
- else:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating TTS setting {key} to: [{value}]")
+ self._settings[key] = value
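+            # "model" must also be mirrored via set_model_name(); "language"
+            # strings are stored as Language enum values below.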
+ if key == "model":
+ self.set_model_name(value)
+ elif key == "language":
+ self._settings[key] = Language(value)
else:
logger.warning(f"Unknown setting for TTS service: {key}")
- self._settings.update(settings)
-
async def say(self, text: str):
aggregate_sentences = self._aggregate_sentences
self._aggregate_sentences = False
@@ -309,7 +250,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
elif isinstance(frame, TTSSpeakFrame):
await self._push_tts_frames(frame.text)
await self.flush_audio()
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts":
+ elif isinstance(frame, TTSUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -448,10 +389,6 @@ def __init__(self, **kwargs):
async def set_model(self, model: str):
self.set_model_name(model)
- @abstractmethod
- async def set_language(self, language: Language):
- pass
-
@abstractmethod
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Returns transcript as a string"""
@@ -459,20 +396,16 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- if key == "language":
- await setter(Language(value))
- else:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating STT setting {key} to: [{value}]")
+ self._settings[key] = value
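+            # As in the TTS service: mirror "model" into the model name and
+            # coerce "language" strings to the Language enum.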
+ if key == "model":
+ self.set_model_name(value)
+ elif key == "language":
+ self._settings[key] = Language(value)
else:
logger.warning(f"Unknown setting for STT service: {key}")
- self._settings.update(settings)
-
async def process_audio_frame(self, frame: AudioRawFrame):
await self.process_generator(self.run_stt(frame.audio))
@@ -484,7 +417,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
# In this service we accumulate audio internally and at the end we
# push a TextFrame. We don't really want to push audio frames down.
await self.process_audio_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt":
+ elif isinstance(frame, STTUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py
index 09d6c5402..8b4253c3e 100644
--- a/src/pipecat/services/anthropic.py
+++ b/src/pipecat/services/anthropic.py
@@ -25,7 +25,7 @@
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
- ServiceUpdateSettingsFrame,
+ LLMUpdateSettingsFrame,
StartInterruptionFrame,
TextFrame,
UserImageRawFrame,
@@ -96,12 +96,15 @@ def __init__(
super().__init__(**kwargs)
self._client = AsyncAnthropic(api_key=api_key)
self.set_model_name(model)
- self._max_tokens = params.max_tokens
- self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False
- self._temperature = params.temperature
- self._top_k = params.top_k
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
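+        # All tunable options live in one dict so LLMUpdateSettingsFrame can
+        # update them by key.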
+ self._settings = {
+ "model": model,
+ "max_tokens": params.max_tokens,
+ "enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
+ "temperature": params.temperature,
+ "top_k": params.top_k,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -120,30 +123,6 @@ def create_context_aggregator(
)
return AnthropicContextAggregatorPair(_user=user, _assistant=assistant)
- async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool):
- logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]")
- self._enable_prompt_caching_beta = enable_prompt_caching_beta
-
- async def set_max_tokens(self, max_tokens: int):
- logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
- self._max_tokens = max_tokens
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_k(self, top_k: float):
- logger.debug(f"Switching LLM top_k to: [{top_k}]")
- self._top_k = top_k
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def _process_context(self, context: OpenAILLMContext):
# Usage tracking. We track the usage reported by Anthropic in prompt_tokens and
# completion_tokens. We also estimate the completion tokens from output text
@@ -165,11 +144,11 @@ async def _process_context(self, context: OpenAILLMContext):
)
messages = context.messages
- if self._enable_prompt_caching_beta:
+ if self._settings["enable_prompt_caching_beta"]:
messages = context.get_messages_with_cache_control_markers()
api_call = self._client.messages.create
- if self._enable_prompt_caching_beta:
+ if self._settings["enable_prompt_caching_beta"]:
api_call = self._client.beta.prompt_caching.messages.create
await self.start_ttfb_metrics()
@@ -179,14 +158,14 @@ async def _process_context(self, context: OpenAILLMContext):
"system": context.system,
"messages": messages,
"model": self.model_name,
- "max_tokens": self._max_tokens,
+ "max_tokens": self._settings["max_tokens"],
"stream": True,
- "temperature": self._temperature,
- "top_k": self._top_k,
- "top_p": self._top_p,
+ "temperature": self._settings["temperature"],
+ "top_k": self._settings["top_k"],
+ "top_p": self._settings["top_p"],
}
- params.update(self._extra)
+ params.update(self._settings["extra"])
response = await api_call(**params)
@@ -286,12 +265,11 @@ async def _process_context(self, context: OpenAILLMContext):
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating LLM setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "model":
+ self.set_model_name(value)
else:
logger.warning(f"Unknown setting for Anthropic LLM service: {key}")
@@ -309,11 +287,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
# UserImageRawFrames coming through the pipeline and add them
# to the context.
context = AnthropicLLMContext.from_image_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+ elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
elif isinstance(frame, LLMEnablePromptCachingFrame):
logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
- self._enable_prompt_caching_beta = frame.enable
+ self._settings["enable_prompt_caching_beta"] = frame.enable
else:
await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index 80240985f..f3cc014b1 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -6,6 +6,7 @@
from typing import AsyncGenerator, Optional
+from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -17,8 +18,6 @@
)
from pipecat.services.ai_services import TTSService
-from loguru import logger
-
try:
import boto3
from botocore.exceptions import BotoCoreError, ClientError
@@ -57,9 +56,15 @@ def __init__(
aws_secret_access_key=api_key,
region_name=region,
)
- self._voice_id = voice_id
- self._sample_rate = sample_rate
- self._params = params
+ self._settings = {
+ "voice_id": voice_id,
+ "sample_rate": sample_rate,
+ "engine": params.engine,
+ "language": params.language,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "volume": params.volume,
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -67,18 +72,18 @@ def can_generate_metrics(self) -> bool:
def _construct_ssml(self, text: str) -> str:
         ssml = "<speak>"
-        if self._params.language:
-            ssml += f"<lang xml:lang='{self._params.language}'>"
+        if self._settings["language"]:
+            ssml += f"<lang xml:lang='{self._settings['language']}'>"
prosody_attrs = []
# Prosody tags are only supported for standard and neural engines
- if self._params.engine != "generative":
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["engine"] != "generative":
+ if self._settings["rate"]:
+                prosody_attrs.append(f"rate='{self._settings['rate']}'")
+            if self._settings["pitch"]:
+                prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+            if self._settings["volume"]:
+                prosody_attrs.append(f"volume='{self._settings['volume']}'")
if prosody_attrs:
             ssml += f"<prosody {' '.join(prosody_attrs)}>"
@@ -90,41 +95,13 @@ def _construct_ssml(self, text: str) -> str:
if prosody_attrs:
             ssml += "</prosody>"
- if self._params.language:
+ if self._settings["language"]:
             ssml += "</lang>"
         ssml += "</speak>"
return ssml
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_engine(self, engine: str):
- logger.debug(f"Switching TTS engine to: [{engine}]")
- self._params.engine = engine
-
- async def set_language(self, language: str):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str):
- logger.debug(f"Switching TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str):
- logger.debug(f"Switching TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_volume(self, volume: str):
- logger.debug(f"Switching TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_params(self, params: InputParams):
- logger.debug(f"Switching TTS params to: [{params}]")
- self._params = params
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -138,9 +115,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"Text": ssml,
"TextType": "ssml",
"OutputFormat": "pcm",
- "VoiceId": self._voice_id,
- "Engine": self._params.engine,
- "SampleRate": str(self._sample_rate),
+ "VoiceId": self._settings["voice_id"],
+ "Engine": self._settings["engine"],
+ "SampleRate": str(self._settings["sample_rate"]),
}
# Filter out None values
@@ -160,7 +137,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
chunk = audio_data[i : i + chunk_size]
if len(chunk) > 0:
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
index a1349cefe..19cd1889a 100644
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -4,12 +4,13 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import io
-
from typing import AsyncGenerator, Optional
+import aiohttp
+from loguru import logger
+from PIL import Image
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -28,10 +29,6 @@
from pipecat.services.openai import BaseOpenAILLMService
from pipecat.utils.time import time_now_iso8601
-from PIL import Image
-
-from loguru import logger
-
# See .env.example for Azure configuration needed
try:
from azure.cognitiveservices.speech import (
@@ -99,114 +96,66 @@ def __init__(
speech_config = SpeechConfig(subscription=api_key, region=region)
self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
- self._voice = voice
- self._sample_rate = sample_rate
- self._params = params
+ self._settings = {
+ "voice": voice,
+ "sample_rate": sample_rate,
+ "emphasis": params.emphasis,
+ "language": params.language,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "role": params.role,
+ "style": params.style,
+ "style_degree": params.style_degree,
+ "volume": params.volume,
+ }
def can_generate_metrics(self) -> bool:
return True
def _construct_ssml(self, text: str) -> str:
ssml = (
-            f"<speak version='1.0' xml:lang='{self._params.language}' "
+            f"<speak version='1.0' xml:lang='{self._settings['language']}' "
             "xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts'>"
-            f"<voice name='{self._voice}'>"
+            f"<voice name='{self._settings['voice']}'>"
         )
-        if self._params.style:
-            ssml += f"<mstts:express-as style='{self._params.style}'>"
+        if self._settings["style"]:
+            ssml += f"<mstts:express-as style='{self._settings['style']}'>"
prosody_attrs = []
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings['rate']}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings['volume']}'")
         ssml += f"<prosody {' '.join(prosody_attrs)}>"
-        if self._params.emphasis:
-            ssml += f"<emphasis level='{self._params.emphasis}'>"
+        if self._settings["emphasis"]:
+            ssml += f"<emphasis level='{self._settings['emphasis']}'>"
ssml += text
- if self._params.emphasis:
+ if self._settings["emphasis"]:
             ssml += "</emphasis>"
         ssml += "</prosody>"
- if self._params.style:
+ if self._settings["style"]:
             ssml += "</mstts:express-as>"
         ssml += "</voice></speak>"
return ssml
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = voice
-
- async def set_emphasis(self, emphasis: str):
- logger.debug(f"Setting TTS emphasis to: [{emphasis}]")
- self._params.emphasis = emphasis
-
- async def set_language(self, language: str):
- logger.debug(f"Setting TTS language code to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str):
- logger.debug(f"Setting TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str):
- logger.debug(f"Setting TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_role(self, role: str):
- logger.debug(f"Setting TTS role to: [{role}]")
- self._params.role = role
-
- async def set_style(self, style: str):
- logger.debug(f"Setting TTS style to: [{style}]")
- self._params.style = style
-
- async def set_style_degree(self, style_degree: str):
- logger.debug(f"Setting TTS style degree to: [{style_degree}]")
- self._params.style_degree = style_degree
-
- async def set_volume(self, volume: str):
- logger.debug(f"Setting TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_params(self, **kwargs):
- valid_params = {
- "voice": self.set_voice,
- "emphasis": self.set_emphasis,
- "language_code": self.set_language,
- "pitch": self.set_pitch,
- "rate": self.set_rate,
- "role": self.set_role,
- "style": self.set_style,
- "style_degree": self.set_style_degree,
- "volume": self.set_volume,
- }
-
- for param, value in kwargs.items():
- if param in valid_params:
- await valid_params[param](value)
- else:
- logger.warning(f"Ignoring unknown parameter: {param}")
-
- logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}")
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -222,7 +171,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
await self.push_frame(TTSStartedFrame())
# Azure always sends a 44-byte header. Strip it off.
yield TTSAudioRawFrame(
- audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1
+ audio=result.audio_data[44:],
+ sample_rate=self._settings["sample_rate"],
+ num_channels=1,
)
await self.push_frame(TTSStoppedFrame())
elif result.reason == ResultReason.Canceled:
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 5f798b1e5..1bbe22953 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -4,36 +4,35 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
+import asyncio
+import base64
import json
import uuid
-import base64
-import asyncio
+from typing import AsyncGenerator, List, Optional, Union
-from typing import AsyncGenerator, Optional, Union, List
+from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
CancelFrame,
+ EndFrame,
ErrorFrame,
Frame,
- StartInterruptionFrame,
+ LLMFullResponseEndFrame,
StartFrame,
- EndFrame,
+ StartInterruptionFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
- LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.ai_services import TTSService, WordTTSService
from pipecat.transcriptions.language import Language
-from pipecat.services.ai_services import WordTTSService, TTSService
-
-from loguru import logger
# See .env.example for Cartesia configuration needed
try:
- from cartesia import AsyncCartesia
import websockets
+ from cartesia import AsyncCartesia
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
@@ -101,17 +100,19 @@ def __init__(
self._api_key = api_key
self._cartesia_version = cartesia_version
self._url = url
- self._voice_id = voice_id
- self._model_id = model_id
- self.set_model_name(model_id)
- self._output_format = {
- "container": params.container,
- "encoding": params.encoding,
- "sample_rate": params.sample_rate,
+ self._settings = {
+ "voice_id": voice_id,
+ "model_id": model_id,
+ "output_format": {
+ "container": params.container,
+ "encoding": params.encoding,
+ "sample_rate": params.sample_rate,
+ },
+ "language": params.language,
+ "speed": params.speed,
+ "emotion": params.emotion,
}
- self._language = params.language
- self._speed = params.speed
- self._emotion = params.emotion
+ self.set_model_name(model_id)
self._websocket = None
self._context_id = None
@@ -121,37 +122,21 @@ def can_generate_metrics(self) -> bool:
return True
async def set_model(self, model: str):
- self._model_id = model
+ self._settings["model_id"] = model
await super().set_model(model)
logger.debug(f"Switching TTS model to: [{model}]")
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_speed(self, speed: str):
- logger.debug(f"Switching TTS speed to: [{speed}]")
- self._speed = speed
-
- async def set_emotion(self, emotion: list[str]):
- logger.debug(f"Switching TTS emotion to: [{emotion}]")
- self._emotion = emotion
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._language = language_to_cartesia_language(language)
-
def _build_msg(
self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
):
- voice_config = {"mode": "id", "id": self._voice_id}
+ voice_config = {"mode": "id", "id": self._settings["voice_id"]}
- if self._speed or self._emotion:
+ if self._settings["speed"] or self._settings["emotion"]:
voice_config["__experimental_controls"] = {}
- if self._speed:
- voice_config["__experimental_controls"]["speed"] = self._speed
- if self._emotion:
- voice_config["__experimental_controls"]["emotion"] = self._emotion
+ if self._settings["speed"]:
+ voice_config["__experimental_controls"]["speed"] = self._settings["speed"]
+ if self._settings["emotion"]:
+ voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"]
msg = {
"transcript": text,
@@ -159,8 +144,8 @@ def _build_msg(
"context_id": self._context_id,
"model_id": self._model_name,
"voice": voice_config,
- "output_format": self._output_format,
- "language": self._language,
+ "output_format": self._settings["output_format"],
+ "language": self._settings["language"],
"add_timestamps": add_timestamps,
}
return json.dumps(msg)
@@ -245,7 +230,7 @@ async def _receive_task_handler(self):
self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["data"]),
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
@@ -311,17 +296,19 @@ def __init__(
super().__init__(**kwargs)
self._api_key = api_key
- self._voice_id = voice_id
- self._model_id = model_id
- self.set_model_name(model_id)
- self._output_format = {
- "container": params.container,
- "encoding": params.encoding,
- "sample_rate": params.sample_rate,
+ self._settings = {
+ "voice_id": voice_id,
+ "model_id": model_id,
+ "output_format": {
+ "container": params.container,
+ "encoding": params.encoding,
+ "sample_rate": params.sample_rate,
+ },
+ "language": params.language,
+ "speed": params.speed,
+ "emotion": params.emotion,
}
- self._language = params.language
- self._speed = params.speed
- self._emotion = params.emotion
+ self.set_model_name(model_id)
self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
@@ -333,22 +320,6 @@ async def set_model(self, model: str):
-        self._model_id = model
+        self._settings["model_id"] = model
await super().set_model(model)
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_speed(self, speed: str):
- logger.debug(f"Switching TTS speed to: [{speed}]")
- self._speed = speed
-
- async def set_emotion(self, emotion: list[str]):
- logger.debug(f"Switching TTS emotion to: [{emotion}]")
- self._emotion = emotion
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._language = language_to_cartesia_language(language)
-
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._client.close()
@@ -365,19 +336,19 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
try:
voice_controls = None
- if self._speed or self._emotion:
+ if self._settings["speed"] or self._settings["emotion"]:
voice_controls = {}
- if self._speed:
- voice_controls["speed"] = self._speed
- if self._emotion:
- voice_controls["emotion"] = self._emotion
+ if self._settings["speed"]:
+ voice_controls["speed"] = self._settings["speed"]
+ if self._settings["emotion"]:
+ voice_controls["emotion"] = self._settings["emotion"]
output = await self._client.tts.sse(
- model_id=self._model_id,
+ model_id=self._settings["model_id"],
transcript=text,
- voice_id=self._voice_id,
- output_format=self._output_format,
- language=self._language,
+ voice_id=self._settings["voice_id"],
+ output_format=self._settings["output_format"],
+ language=self._settings["language"],
stream=False,
_experimental_voice_controls=voice_controls,
)
@@ -386,7 +357,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
frame = TTSAudioRawFrame(
audio=output["audio"],
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
yield frame
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index d109cce3c..28f472e55 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -5,9 +5,10 @@
#
import asyncio
-
from typing import AsyncGenerator
+from loguru import logger
+
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -24,8 +25,6 @@
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
-from loguru import logger
-
# See .env.example for Deepgram configuration needed
try:
from deepgram import (
@@ -57,25 +56,23 @@ def __init__(
):
super().__init__(**kwargs)
- self._voice = voice
- self._sample_rate = sample_rate
- self._encoding = encoding
+ self._settings = {
+ "voice": voice,
+ "sample_rate": sample_rate,
+ "encoding": encoding,
+ }
self._deepgram_client = DeepgramClient(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
options = SpeakOptions(
- model=self._voice,
- encoding=self._encoding,
- sample_rate=self._sample_rate,
+ model=self._settings["voice"],
+ encoding=self._settings["encoding"],
+ sample_rate=self._settings["sample_rate"],
container="none",
)
@@ -103,7 +100,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
chunk = audio_buffer.read(chunk_size)
if not chunk:
break
- frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1)
+ frame = TTSAudioRawFrame(
+ audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1
+ )
yield frame
await self.push_frame(TTSStoppedFrame())
@@ -135,7 +134,9 @@ def __init__(
):
super().__init__(**kwargs)
- self._live_options = live_options
+ self._settings = {
+ "live_options": live_options,
+ }
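+        # The LiveOptions object is stored whole; set_model() mutates it and
+        # triggers a reconnect.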
self._client = DeepgramClient(
api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})
@@ -147,7 +148,7 @@ def __init__(
@property
def vad_enabled(self):
- return self._live_options.vad_events
+ return self._settings["live_options"].vad_events
def can_generate_metrics(self) -> bool:
return self.vad_enabled
@@ -155,13 +156,7 @@ def can_generate_metrics(self) -> bool:
async def set_model(self, model: str):
await super().set_model(model)
logger.debug(f"Switching STT model to: [{model}]")
- self._live_options.model = model
- await self._disconnect()
- await self._connect()
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching STT language to: [{language}]")
- self._live_options.language = language
+ self._settings["live_options"].model = model
await self._disconnect()
await self._connect()
@@ -182,7 +177,7 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
yield None
async def _connect(self):
- if await self._connection.start(self._live_options):
+ if await self._connection.start(self._settings["live_options"]):
logger.debug(f"{self}: Connected to Deepgram")
else:
logger.error(f"{self}: Unable to connect to Deepgram")
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
index 611f2a024..6b05a1f22 100644
--- a/src/pipecat/services/elevenlabs.py
+++ b/src/pipecat/services/elevenlabs.py
@@ -124,10 +124,20 @@ def __init__(
)
self._api_key = api_key
- self._voice_id = voice_id
- self.set_model_name(model)
self._url = url
- self._params = params
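+        # sample_rate is derived from the output_format string (e.g. "pcm_16000").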
+ self._settings = {
+ "voice_id": voice_id,
+ "sample_rate": sample_rate_from_output_format(params.output_format),
+ "model": model,
+ "language": params.language,
+ "output_format": params.output_format,
+ "optimize_streaming_latency": params.optimize_streaming_latency,
+ "stability": params.stability,
+ "similarity_boost": params.similarity_boost,
+ "style": params.style,
+ "use_speaker_boost": params.use_speaker_boost,
+ }
+ self.set_model_name(model)
self._voice_settings = self._set_voice_settings()
# Websocket connection to ElevenLabs.
@@ -142,19 +152,22 @@ def can_generate_metrics(self) -> bool:
def _set_voice_settings(self):
voice_settings = {}
- if self._params.stability is not None and self._params.similarity_boost is not None:
- voice_settings["stability"] = self._params.stability
- voice_settings["similarity_boost"] = self._params.similarity_boost
- if self._params.style is not None:
- voice_settings["style"] = self._params.style
- if self._params.use_speaker_boost is not None:
- voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
+ if (
+ self._settings["stability"] is not None
+ and self._settings["similarity_boost"] is not None
+ ):
+ voice_settings["stability"] = self._settings["stability"]
+ voice_settings["similarity_boost"] = self._settings["similarity_boost"]
+ if self._settings["style"] is not None:
+ voice_settings["style"] = self._settings["style"]
+ if self._settings["use_speaker_boost"] is not None:
+ voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
else:
- if self._params.style is not None:
+ if self._settings["style"] is not None:
logger.warning(
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
- if self._params.use_speaker_boost is not None:
+ if self._settings["use_speaker_boost"] is not None:
logger.warning(
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
@@ -167,34 +180,6 @@ async def set_model(self, model: str):
await self._disconnect()
await self._connect()
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
- await self._disconnect()
- await self._connect()
-
- async def set_voice_settings(
- self,
- stability: Optional[float] = None,
- similarity_boost: Optional[float] = None,
- style: Optional[float] = None,
- use_speaker_boost: Optional[bool] = None,
- ):
- self._params.stability = stability if stability is not None else self._params.stability
- self._params.similarity_boost = (
- similarity_boost if similarity_boost is not None else self._params.similarity_boost
- )
- self._params.style = style if style is not None else self._params.style
- self._params.use_speaker_boost = (
- use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
- )
-
- self._set_voice_settings()
-
- if self._websocket:
- msg = {"voice_settings": self._voice_settings}
- await self._websocket.send(json.dumps(msg))
-
async def start(self, frame: StartFrame):
await super().start(frame)
await self._connect()
@@ -221,21 +206,21 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
async def _connect(self):
try:
- voice_id = self._voice_id
- model = self.model_name
- output_format = self._params.output_format
+ voice_id = self._settings["voice_id"]
+ model = self._settings["model"]
+ output_format = self._settings["output_format"]
url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
- if self._params.optimize_streaming_latency:
- url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
+ if self._settings["optimize_streaming_latency"]:
+                url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
# language can only be used with the 'eleven_turbo_v2_5' model
- if self._params.language:
+ if self._settings["language"]:
if model == "eleven_turbo_v2_5":
- url += f"&language_code={self._params.language}"
+                url += f"&language_code={self._settings['language']}"
else:
logger.debug(
- f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
+                    f"Language code [{self._settings['language']}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
)
self._websocket = await websockets.connect(url)
@@ -286,7 +271,7 @@ async def _receive_task_handler(self):
self.start_word_timestamps()
audio = base64.b64decode(msg["audio"])
- frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
await self.push_frame(frame)
if msg.get("alignment"):
diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py
index a590d73cf..7537b780c 100644
--- a/src/pipecat/services/gladia.py
+++ b/src/pipecat/services/gladia.py
@@ -6,8 +6,9 @@
import base64
import json
-
from typing import AsyncGenerator, Optional
+
+from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
@@ -21,8 +22,6 @@
from pipecat.services.ai_services import STTService
from pipecat.utils.time import time_now_iso8601
-from loguru import logger
-
# See .env.example for Gladia configuration needed
try:
import websockets
@@ -55,8 +54,14 @@ def __init__(
self._api_key = api_key
self._url = url
- self._params = params
- self._confidence = confidence
+ self._settings = {
+ "confidence": confidence,
+ "sample_rate": params.sample_rate,
+ "language": params.language,
+ "transcription_hint": params.transcription_hint,
+ "endpointing": params.endpointing,
+ "prosody": params.prosody,
+ }
async def start(self, frame: StartFrame):
await super().start(frame)
@@ -84,9 +89,15 @@ async def _setup_gladia(self):
"encoding": "WAV/PCM",
"model_type": "fast",
"language_behaviour": "manual",
- **self._params.model_dump(exclude_none=True),
+ "sample_rate": self._settings["sample_rate"],
+ "language": self._settings["language"],
+ "transcription_hint": self._settings["transcription_hint"],
+ "endpointing": self._settings["endpointing"],
+ "prosody": self._settings["prosody"],
}
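+        # Drop unset options so explicit nulls aren't sent to Gladia.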
+ configuration = {k: v for k, v in configuration.items() if v is not None}
+
await self._websocket.send(json.dumps(configuration))
async def _send_audio(self, audio: bytes):
@@ -106,7 +117,7 @@ async def _receive_task_handler(self):
type = utterance["type"]
confidence = utterance["confidence"]
transcript = utterance["transcription"]
- if confidence >= self._confidence:
+ if confidence >= self._settings["confidence"]:
if type == "final":
await self.push_frame(
TranscriptionFrame(transcript, "", time_now_iso8601())
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index 092b4703e..a759e4920 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -17,7 +17,7 @@
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
- ServiceUpdateSettingsFrame,
+ LLMUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSStartedFrame,
@@ -55,6 +55,9 @@ class GoogleLLMService(LLMService):
def __init__(self, *, api_key: str, model: str = "gemini-1.5-flash-latest", **kwargs):
super().__init__(**kwargs)
gai.configure(api_key=api_key)
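+        # "model" is the only tunable setting; updating it recreates the client.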
+ self._settings = {
+ "model": model,
+ }
self._create_client(model)
def can_generate_metrics(self) -> bool:
@@ -64,18 +67,13 @@ def _create_client(self, model: str):
self.set_model_name(model)
self._client = gai.GenerativeModel(model)
- async def set_model(self, model: str):
- logger.debug(f"Switching LLM model to: [{model}]")
- self._create_client(model)
-
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating LLM setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "model":
+ self._create_client(value)
else:
logger.warning(f"Unknown setting for Google LLM service: {key}")
@@ -151,7 +149,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+ elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -182,8 +180,17 @@ def __init__(
):
super().__init__(sample_rate=sample_rate, **kwargs)
- self._voice_id: str = voice_id
- self._params = params
+ self._settings = {
+ "voice_id": voice_id,
+ "sample_rate": sample_rate,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "volume": params.volume,
+ "emphasis": params.emphasis,
+ "language": params.language,
+ "gender": params.gender,
+ "google_style": params.google_style,
+ }
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
credentials, credentials_path
)
@@ -215,39 +222,39 @@ def _construct_ssml(self, text: str) -> str:
         ssml = "<speak>"
# Voice tag
- voice_attrs = [f"name='{self._voice_id}'"]
- if self._params.language:
- voice_attrs.append(f"language='{self._params.language}'")
- if self._params.gender:
- voice_attrs.append(f"gender='{self._params.gender}'")
+        voice_attrs = [f"name='{self._settings['voice_id']}'"]
+ if self._settings["language"]:
+ voice_attrs.append(f"language='{self._settings['language']}'")
+ if self._settings["gender"]:
+ voice_attrs.append(f"gender='{self._settings['gender']}'")
         ssml += f"<voice {' '.join(voice_attrs)}>"
# Prosody tag
prosody_attrs = []
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings['rate']}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings['volume']}'")
if prosody_attrs:
             ssml += f"<prosody {' '.join(prosody_attrs)}>"
# Emphasis tag
-        if self._params.emphasis:
-            ssml += f"<emphasis level='{self._params.emphasis}'>"
+        if self._settings["emphasis"]:
+            ssml += f"<emphasis level='{self._settings['emphasis']}'>"
# Google style tag
-        if self._params.google_style:
-            ssml += f"<google:style name='{self._params.google_style}'>"
+        if self._settings["google_style"]:
+            ssml += f"<google:style name='{self._settings['google_style']}'>"
ssml += text
# Close tags
- if self._params.google_style:
+ if self._settings["google_style"]:
             ssml += "</google:style>"
- if self._params.emphasis:
+ if self._settings["emphasis"]:
             ssml += "</emphasis>"
if prosody_attrs:
             ssml += "</prosody>"
@@ -255,46 +262,6 @@ def _construct_ssml(self, text: str) -> str:
return ssml
- async def set_voice(self, voice: str) -> None:
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_language(self, language: str) -> None:
- logger.debug(f"Switching TTS language to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str) -> None:
- logger.debug(f"Switching TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str) -> None:
- logger.debug(f"Switching TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_volume(self, volume: str) -> None:
- logger.debug(f"Switching TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_emphasis(
- self, emphasis: Literal["strong", "moderate", "reduced", "none"]
- ) -> None:
- logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
- self._params.emphasis = emphasis
-
- async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None:
- logger.debug(f"Switch TTS gender to [{gender}]")
- self._params.gender = gender
-
- async def set_google_style(
- self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"]
- ) -> None:
- logger.debug(f"Switching TTS google style to: [{google_style}]")
- self._params.google_style = google_style
-
- async def set_params(self, params: InputParams) -> None:
- logger.debug(f"Switching TTS params to: [{params}]")
- self._params = params
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -304,11 +271,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
ssml = self._construct_ssml(text)
synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml)
voice = texttospeech_v1.VoiceSelectionParams(
- language_code=self._params.language, name=self._voice_id
+ language_code=self._settings["language"], name=self._settings["voice_id"]
)
audio_config = texttospeech_v1.AudioConfig(
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
- sample_rate_hertz=self.sample_rate,
+ sample_rate_hertz=self._settings["sample_rate"],
)
request = texttospeech_v1.SynthesizeSpeechRequest(
@@ -331,7 +298,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
if not chunk:
break
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await asyncio.sleep(0) # Allow other tasks to run
diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py
index 8f18002c5..2427aeb07 100644
--- a/src/pipecat/services/lmnt.py
+++ b/src/pipecat/services/lmnt.py
@@ -5,10 +5,10 @@
#
import asyncio
-
from typing import AsyncGenerator
-from pipecat.processors.frame_processor import FrameDirection
+from loguru import logger
+
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -20,10 +20,9 @@
TTSStartedFrame,
TTSStoppedFrame,
)
+from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import TTSService
-from loguru import logger
-
# See .env.example for LMNT configuration needed
try:
from lmnt.api import Speech
@@ -50,13 +49,15 @@ def __init__(
super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs)
self._api_key = api_key
- self._voice_id = voice_id
- self._output_format = {
- "container": "raw",
- "encoding": "pcm_s16le",
- "sample_rate": sample_rate,
+ self._settings = {
+ "voice_id": voice_id,
+ "output_format": {
+ "container": "raw",
+ "encoding": "pcm_s16le",
+ "sample_rate": sample_rate,
+ },
+ "language": language,
}
- self._language = language
self._speech = None
self._connection = None
@@ -68,10 +69,6 @@ def __init__(
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
async def start(self, frame: StartFrame):
await super().start(frame)
await self._connect()
@@ -93,7 +90,9 @@ async def _connect(self):
try:
self._speech = Speech()
self._connection = await self._speech.synthesize_streaming(
- self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]
+ self._settings["voice_id"],
+ format="raw",
+ sample_rate=self._settings["output_format"]["sample_rate"],
)
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
except Exception as e:
@@ -130,7 +129,7 @@ async def _receive_task_handler(self):
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(
audio=msg["audio"],
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py
index 57c1b3399..8b479294a 100644
--- a/src/pipecat/services/openai.py
+++ b/src/pipecat/services/openai.py
@@ -24,7 +24,7 @@
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
- ServiceUpdateSettingsFrame,
+ LLMUpdateSettingsFrame,
StartInterruptionFrame,
TextFrame,
TTSAudioRawFrame,
@@ -111,14 +111,17 @@ def __init__(
**kwargs,
):
super().__init__(**kwargs)
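+        # Tunable generation parameters live in one dict so that
+        # LLMUpdateSettingsFrame can update them by key.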
+ self._settings = {
+ "model": model,
+ "frequency_penalty": params.frequency_penalty,
+ "presence_penalty": params.presence_penalty,
+ "seed": params.seed,
+ "temperature": params.temperature,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
self.set_model_name(model)
self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
- self._frequency_penalty = params.frequency_penalty
- self._presence_penalty = params.presence_penalty
- self._seed = params.seed
- self._temperature = params.temperature
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
def create_client(self, api_key=None, base_url=None, **kwargs):
return AsyncOpenAI(
@@ -134,30 +137,6 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
def can_generate_metrics(self) -> bool:
return True
- async def set_frequency_penalty(self, frequency_penalty: float):
- logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
- self._frequency_penalty = frequency_penalty
-
- async def set_presence_penalty(self, presence_penalty: float):
- logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
- self._presence_penalty = presence_penalty
-
- async def set_seed(self, seed: int):
- logger.debug(f"Switching LLM seed to: [{seed}]")
- self._seed = seed
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> AsyncStream[ChatCompletionChunk]:
@@ -168,14 +147,14 @@ async def get_chat_completions(
"tools": context.tools,
"tool_choice": context.tool_choice,
"stream_options": {"include_usage": True},
- "frequency_penalty": self._frequency_penalty,
- "presence_penalty": self._presence_penalty,
- "seed": self._seed,
- "temperature": self._temperature,
- "top_p": self._top_p,
+ "frequency_penalty": self._settings["frequency_penalty"],
+ "presence_penalty": self._settings["presence_penalty"],
+ "seed": self._settings["seed"],
+ "temperature": self._settings["temperature"],
+ "top_p": self._settings["top_p"],
}
- params.update(self._extra)
+ params.update(self._settings["extra"])
chunks = await self._client.chat.completions.create(**params)
return chunks
@@ -297,12 +276,11 @@ async def _process_context(self, context: OpenAILLMContext):
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating LLM setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "model":
+ self.set_model_name(value)
else:
logger.warning(f"Unknown setting for OpenAI LLM service: {key}")
@@ -316,7 +294,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+ elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -421,22 +399,21 @@ def __init__(
):
super().__init__(sample_rate=sample_rate, **kwargs)
- self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
+ self._settings = {
+ "voice": VALID_VOICES.get(voice, "alloy"),
+ "model": model,
+ "sample_rate": sample_rate,
+ }
self.set_model_name(model)
- self._sample_rate = sample_rate
self._client = AsyncOpenAI(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = VALID_VOICES.get(voice, self._voice)
-
async def set_model(self, model: str):
logger.debug(f"Switching TTS model to: [{model}]")
- self._model = model
+ self._settings["model"] = model
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -445,8 +422,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
async with self._client.audio.speech.with_streaming_response.create(
input=text,
- model=self.model_name,
- voice=self._voice,
+ model=self._settings["model"],
+ voice=self._settings["voice"],
response_format="pcm",
) as r:
if r.status_code != 200:
@@ -465,7 +442,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
async for chunk in r.iter_bytes(8192):
if len(chunk) > 0:
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except BadRequestError as e:
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
index 2ffa3a419..44ed9145e 100644
--- a/src/pipecat/services/playht.py
+++ b/src/pipecat/services/playht.py
@@ -6,17 +6,21 @@
import io
import struct
-
from typing import AsyncGenerator
-from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame
-from pipecat.services.ai_services import TTSService
-
from loguru import logger
+from pipecat.frames.frames import (
+ Frame,
+ TTSAudioRawFrame,
+ TTSStartedFrame,
+ TTSStoppedFrame,
+)
+from pipecat.services.ai_services import TTSService
+
try:
- from pyht.client import TTSOptions
from pyht.async_client import AsyncClient
+ from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
@@ -39,17 +43,23 @@ def __init__(
user_id=self._user_id,
api_key=self._speech_key,
)
+ self._settings = {
+ "voice": voice_url,
+ "sample_rate": sample_rate,
+ "quality": "higher",
+ "format": Format.FORMAT_WAV,
+ "voice_engine": "PlayHT2.0-turbo",
+ }
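+        # TTSOptions is constructed once from this settings snapshot.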
self._options = TTSOptions(
- voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV
+ voice=self._settings["voice"],
+ sample_rate=self._settings["sample_rate"],
+ quality=self._settings["quality"],
+ format=self._settings["format"],
)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._options.voice = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -60,7 +70,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
await self.start_ttfb_metrics()
playht_gen = self._client.tts(
- text, voice_engine="PlayHT2.0-turbo", options=self._options
+ text, voice_engine=self._settings["voice_engine"], options=self._options
)
await self.start_tts_usage_metrics(text)
@@ -83,7 +93,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
else:
if len(chunk):
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, 16000, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except Exception as e:
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 26a1a99fd..5fe11a179 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -50,13 +50,17 @@ def __init__(
):
super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs)
self.set_model_name(model)
- self._max_tokens = params.max_tokens
- self._frequency_penalty = params.frequency_penalty
- self._presence_penalty = params.presence_penalty
- self._temperature = params.temperature
- self._top_k = params.top_k
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
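+        # Same consolidated-settings pattern as the base OpenAI service.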
+ self._settings = {
+ "model": model,
+ "max_tokens": params.max_tokens,
+ "frequency_penalty": params.frequency_penalty,
+ "presence_penalty": params.presence_penalty,
+ "temperature": params.temperature,
+ "top_k": params.top_k,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ "seed": params.seed,
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -73,41 +77,12 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
),
)
- async def set_frequency_penalty(self, frequency_penalty: float):
- logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
- self._frequency_penalty = frequency_penalty
-
- async def set_max_tokens(self, max_tokens: int):
- logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
- self._max_tokens = max_tokens
-
- async def set_presence_penalty(self, presence_penalty: float):
- logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
- self._presence_penalty = presence_penalty
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_k(self, top_k: float):
- logger.debug(f"Switching LLM top_k to: [{top_k}]")
- self._top_k = top_k
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating LLM setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "model":
+ self.set_model_name(value)
else:
logger.warning(f"Unknown setting for Together LLM service: {key}")
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 2c47d59e8..c833735d0 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -4,10 +4,12 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
-
from typing import Any, AsyncGenerator, Dict
+import aiohttp
+import numpy as np
+from loguru import logger
+
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -18,10 +20,6 @@
)
from pipecat.services.ai_services import TTSService
-import numpy as np
-
-from loguru import logger
-
try:
import resampy
except ModuleNotFoundError as e:
@@ -50,9 +48,11 @@ def __init__(
):
super().__init__(**kwargs)
- self._voice_id = voice_id
- self._language = language
- self._base_url = base_url
+ self._settings = {
+ "voice_id": voice_id,
+ "language": language,
+ "base_url": base_url,
+ }
self._studio_speakers: Dict[str, Any] | None = None
self._aiohttp_session = aiohttp_session
@@ -61,7 +61,7 @@ def can_generate_metrics(self) -> bool:
async def start(self, frame: StartFrame):
await super().start(frame)
- async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r:
+ async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r:
if r.status != 200:
text = await r.text()
logger.error(
@@ -75,10 +75,6 @@ async def start(self, frame: StartFrame):
return
self._studio_speakers = await r.json()
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -86,13 +82,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.error(f"{self} no studio speakers available")
return
- embeddings = self._studio_speakers[self._voice_id]
+ embeddings = self._studio_speakers[self._settings["voice_id"]]
- url = self._base_url + "/tts_stream"
+ url = self._settings["base_url"] + "/tts_stream"
payload = {
"text": text.replace(".", "").replace("*", ""),
- "language": self._language,
+ "language": self._settings["language"],
"speaker_embedding": embeddings["speaker_embedding"],
"gpt_cond_latent": embeddings["gpt_cond_latent"],
"add_wav_header": False,