From 08d0738e3870f810c6fcb4226ebf418802ff79f4 Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Tue, 1 Oct 2024 14:37:07 -0400
Subject: [PATCH] Update to use LLM, STT, TTS subclasses and remove setter
 methods

---
 src/pipecat/frames/frames.py        |  16 +++-
 src/pipecat/services/ai_services.py | 109 ++++------------------
 src/pipecat/services/anthropic.py   |  70 +++++----------
 src/pipecat/services/aws.py         |  71 +++++----------
 src/pipecat/services/azure.py       | 121 ++++++++-----------------
 src/pipecat/services/cartesia.py    | 135 +++++++++++-----------------
 src/pipecat/services/deepgram.py    |  43 ++++-----
 src/pipecat/services/elevenlabs.py  |  83 +++++++----------
 src/pipecat/services/gladia.py      |  25 ++++--
 src/pipecat/services/google.py      | 115 +++++++++--------------
 src/pipecat/services/lmnt.py        |  31 ++++---
 src/pipecat/services/openai.py      |  85 +++++++-----------
 src/pipecat/services/playht.py      |  34 ++++---
 src/pipecat/services/together.py    |  57 ++++--------
 src/pipecat/services/xtts.py        |  30 +++----
 15 files changed, 380 insertions(+), 645 deletions(-)

diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index 7cb93606d..e2ef78df5 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -530,10 +530,24 @@ def __str__(self):
 class ServiceUpdateSettingsFrame(ControlFrame):
     """A control frame containing a request to update service settings."""
 
-    service_type: str
     settings: Dict[str, Any]
 
 
+@dataclass
+class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+    pass
+
+
+@dataclass
+class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+    pass
+
+
+@dataclass
+class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+    pass
+
+
 @dataclass
 class FunctionCallInProgressFrame(SystemFrame):
     """A frame signaling that a function call is in progress."""
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index 6ddc4d426..deebc4345 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -8,7 +8,7 @@
 import io
 import wave
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 
 from loguru import logger
 
@@ -19,14 +19,15 @@
     ErrorFrame,
     Frame,
     LLMFullResponseEndFrame,
-    ServiceUpdateSettingsFrame,
     StartFrame,
     StartInterruptionFrame,
+    STTUpdateSettingsFrame,
     TextFrame,
     TTSAudioRawFrame,
     TTSSpeakFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
+    TTSUpdateSettingsFrame,
     UserImageRequestFrame,
     VisionImageRawFrame,
 )
@@ -175,70 +176,10 @@ def __init__(
 
         self._current_sentence: str = ""
 
-    @property
-    def sample_rate(self) -> int:
-        return self._sample_rate
-
     @abstractmethod
     async def set_model(self, model: str):
         self.set_model_name(model)
 
-    @abstractmethod
-    async def set_voice(self, voice: str):
-        pass
-
-    @abstractmethod
-    async def set_language(self, language: Language):
-        pass
-
-    @abstractmethod
-    async def set_speed(self, speed: Union[str, float]):
-        pass
-
-    @abstractmethod
-    async def set_emotion(self, emotion: List[str]):
-        pass
-
-    @abstractmethod
-    async def set_engine(self, engine: str):
-        pass
-
-    @abstractmethod
-    async def set_pitch(self, pitch: str):
-        pass
-
-    @abstractmethod
-    async def set_rate(self, rate: str):
-        pass
-
-    @abstractmethod
-    async def set_volume(self, volume: str):
-        pass
-
-    @abstractmethod
-    async def set_emphasis(self, emphasis: str):
-        pass
-
-    @abstractmethod
-    async def set_style(self, style: str):
-        pass
-
-    @abstractmethod
-    async def set_style_degree(self, style_degree: str):
-        pass
-
-    @abstractmethod
-    async def set_role(self, role: str):
-        pass
-
-    @abstractmethod
-    async def set_gender(self, gender: str):
-        pass
-
-    @abstractmethod
-    async def set_google_style(self, google_style: str):
-        pass
-
     @abstractmethod
     async def flush_audio(self):
         pass
@@ -269,20 +210,16 @@ async def cancel(self, frame: CancelFrame):
 
     async def _update_settings(self, settings: Dict[str, Any]):
         for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    if key == "language":
-                        await setter(Language(value))
-                    else:
-                        await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
+            if key in self._settings:
+                logger.debug(f"Updating TTS setting {key} to: [{value}]")
+                self._settings[key] = value
+                if key == "model":
+                    self.set_model_name(value)
+                elif key == "language":
+                    self._settings[key] = Language(value)
             else:
                 logger.warning(f"Unknown setting for TTS service: {key}")
 
-        self._settings.update(settings)
-
     async def say(self, text: str):
         aggregate_sentences = self._aggregate_sentences
         self._aggregate_sentences = False
@@ -309,7 +246,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         elif isinstance(frame, TTSSpeakFrame):
             await self._push_tts_frames(frame.text)
             await self.flush_audio()
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts":
+        elif isinstance(frame, TTSUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
             await self.push_frame(frame, direction)
@@ -448,10 +385,6 @@ def __init__(self, **kwargs):
     async def set_model(self, model: str):
         self.set_model_name(model)
 
-    @abstractmethod
-    async def set_language(self, language: Language):
-        pass
-
     @abstractmethod
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         """Returns transcript as a string"""
@@ -459,20 +392,16 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
 
     async def _update_settings(self, settings: Dict[str, Any]):
         for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    if key == "language":
-                        await setter(Language(value))
-                    else:
-                        await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
+            if key in self._settings:
+                logger.debug(f"Updating STT setting {key} to: [{value}]")
+                self._settings[key] = value
+                if key == "model":
+                    self.set_model_name(value)
+                elif key == "language":
+                    self._settings[key] = Language(value)
             else:
                 logger.warning(f"Unknown setting for STT service: {key}")
 
-        self._settings.update(settings)
-
     async def process_audio_frame(self, frame: AudioRawFrame):
         await self.process_generator(self.run_stt(frame.audio))
 
@@ -484,7 +413,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             # In this service we accumulate audio internally and at the end we
             # push a TextFrame. We don't really want to push audio frames down.
             await self.process_audio_frame(frame)
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt":
+        elif isinstance(frame, STTUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
             await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py
index 09d6c5402..8b4253c3e 100644
--- a/src/pipecat/services/anthropic.py
+++ b/src/pipecat/services/anthropic.py
@@ -25,7 +25,7 @@
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesFrame,
-    ServiceUpdateSettingsFrame,
+    LLMUpdateSettingsFrame,
     StartInterruptionFrame,
     TextFrame,
     UserImageRawFrame,
@@ -96,12 +96,15 @@ def __init__(
         super().__init__(**kwargs)
         self._client = AsyncAnthropic(api_key=api_key)
         self.set_model_name(model)
-        self._max_tokens = params.max_tokens
-        self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False
-        self._temperature = params.temperature
-        self._top_k = params.top_k
-        self._top_p = params.top_p
-        self._extra = params.extra if isinstance(params.extra, dict) else {}
+        self._settings = {
+            "model": model,
+            "max_tokens": params.max_tokens,
+            "enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
+            "temperature": params.temperature,
+            "top_k": params.top_k,
+            "top_p": params.top_p,
+            "extra": params.extra if isinstance(params.extra, dict) else {},
+        }
 
     def can_generate_metrics(self) -> bool:
         return True
@@ -120,30 +123,6 @@ def create_context_aggregator(
         )
         return AnthropicContextAggregatorPair(_user=user, _assistant=assistant)
 
-    async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool):
-        logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]")
-        self._enable_prompt_caching_beta = enable_prompt_caching_beta
-
-    async def set_max_tokens(self, max_tokens: int):
-        logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
-        self._max_tokens = max_tokens
-
-    async def set_temperature(self, temperature: float):
-        logger.debug(f"Switching LLM temperature to: [{temperature}]")
-        self._temperature = temperature
-
-    async def set_top_k(self, top_k: float):
-        logger.debug(f"Switching LLM top_k to: [{top_k}]")
-        self._top_k = top_k
-
-    async def set_top_p(self, top_p: float):
-        logger.debug(f"Switching LLM top_p to: [{top_p}]")
-        self._top_p = top_p
-
-    async def set_extra(self, extra: Dict[str, Any]):
-        logger.debug(f"Switching LLM extra to: [{extra}]")
-        self._extra = extra
-
     async def _process_context(self, context: OpenAILLMContext):
         # Usage tracking. We track the usage reported by Anthropic in prompt_tokens and
         # completion_tokens. We also estimate the completion tokens from output text
@@ -165,11 +144,11 @@ async def _process_context(self, context: OpenAILLMContext):
             )
 
             messages = context.messages
-            if self._enable_prompt_caching_beta:
+            if self._settings["enable_prompt_caching_beta"]:
                 messages = context.get_messages_with_cache_control_markers()
 
             api_call = self._client.messages.create
-            if self._enable_prompt_caching_beta:
+            if self._settings["enable_prompt_caching_beta"]:
                 api_call = self._client.beta.prompt_caching.messages.create
 
             await self.start_ttfb_metrics()
@@ -179,14 +158,14 @@ async def _process_context(self, context: OpenAILLMContext):
                 "system": context.system,
                 "messages": messages,
                 "model": self.model_name,
-                "max_tokens": self._max_tokens,
+                "max_tokens": self._settings["max_tokens"],
                 "stream": True,
-                "temperature": self._temperature,
-                "top_k": self._top_k,
-                "top_p": self._top_p,
+                "temperature": self._settings["temperature"],
+                "top_k": self._settings["top_k"],
+                "top_p": self._settings["top_p"],
             }
 
-            params.update(self._extra)
+            params.update(self._settings["extra"])
 
             response = await api_call(**params)
@@ -286,12 +265,11 @@ async def _process_context(self, context: OpenAILLMContext):
 
     async def _update_settings(self, settings: Dict[str, Any]):
         for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
+            if key in self._settings:
+                logger.debug(f"Updating LLM setting {key} to: [{value}]")
+                self._settings[key] = value
+                if key == "model":
+                    self.set_model_name(value)
             else:
                 logger.warning(f"Unknown setting for Anthropic LLM service: {key}")
 
@@ -309,11 +287,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             # UserImageRawFrames coming through the pipeline and add them
             # to the context.
             context = AnthropicLLMContext.from_image_frame(frame)
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+        elif isinstance(frame, LLMUpdateSettingsFrame):
            await self._update_settings(frame.settings)
         elif isinstance(frame, LLMEnablePromptCachingFrame):
             logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
-            self._enable_prompt_caching_beta = frame.enable
+            self._settings["enable_prompt_caching_beta"] = frame.enable
         else:
             await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index 80240985f..f3cc014b1 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -6,6 +6,7 @@
 
 from typing import AsyncGenerator, Optional
 
+from loguru import logger
 from pydantic import BaseModel
 
 from pipecat.frames.frames import (
@@ -17,8 +18,6 @@
 )
 from pipecat.services.ai_services import TTSService
 
-from loguru import logger
-
 try:
     import boto3
     from botocore.exceptions import BotoCoreError, ClientError
@@ -57,9 +56,15 @@ def __init__(
             aws_secret_access_key=api_key,
             region_name=region,
         )
-        self._voice_id = voice_id
-        self._sample_rate = sample_rate
-        self._params = params
+        self._settings = {
+            "voice_id": voice_id,
+            "sample_rate": sample_rate,
+            "engine": params.engine,
+            "language": params.language,
+            "pitch": params.pitch,
+            "rate": params.rate,
+            "volume": params.volume,
+        }
 
     def can_generate_metrics(self) -> bool:
         return True
@@ -67,18 +72,18 @@ def can_generate_metrics(self) -> bool:
     def _construct_ssml(self, text: str) -> str:
         ssml = "<speak>"
 
-        if self._params.language:
-            ssml += f"<lang xml:lang='{self._params.language}'>"
+        if self._settings["language"]:
+            ssml += f"<lang xml:lang='{self._settings['language']}'>"
 
         prosody_attrs = []
         # Prosody tags are only supported for standard and neural engines
-        if self._params.engine != "generative":
-            if self._params.rate:
-                prosody_attrs.append(f"rate='{self._params.rate}'")
-            if self._params.pitch:
-                prosody_attrs.append(f"pitch='{self._params.pitch}'")
-            if self._params.volume:
-                prosody_attrs.append(f"volume='{self._params.volume}'")
+        if self._settings["engine"] != "generative":
+            if self._settings["rate"]:
+                prosody_attrs.append(f"rate='{self._settings['rate']}'")
+            if self._settings["pitch"]:
+                prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+            if self._settings["volume"]:
+                prosody_attrs.append(f"volume='{self._settings['volume']}'")
 
         if prosody_attrs:
             ssml += f"<prosody {' '.join(prosody_attrs)}>"
@@ -90,41 +95,13 @@ def _construct_ssml(self, text: str) -> str:
         if prosody_attrs:
             ssml += "</prosody>"
 
-        if self._params.language:
+        if self._settings["language"]:
             ssml += "</lang>"
 
         ssml += "</speak>"
 
         return ssml
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
-    async def set_engine(self, engine: str):
-        logger.debug(f"Switching TTS engine to: [{engine}]")
-        self._params.engine = engine
-
-    async def set_language(self, language: str):
-        logger.debug(f"Switching TTS language to: [{language}]")
-        self._params.language = language
-
-    async def set_pitch(self, pitch: str):
-        logger.debug(f"Switching TTS pitch to: [{pitch}]")
-        self._params.pitch = pitch
-
-    async def set_rate(self, rate: str):
-        logger.debug(f"Switching TTS rate to: [{rate}]")
-        self._params.rate = rate
-
-    async def set_volume(self, volume: str):
-        logger.debug(f"Switching TTS volume to: [{volume}]")
-        self._params.volume = volume
-
-    async def set_params(self, params: InputParams):
-        logger.debug(f"Switching TTS params to: [{params}]")
-        self._params = params
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]") @@ -138,9 +115,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: "Text": ssml, "TextType": "ssml", "OutputFormat": "pcm", - "VoiceId": self._voice_id, - "Engine": self._params.engine, - "SampleRate": str(self._sample_rate), + "VoiceId": self._settings["voice_id"], + "Engine": self._settings["engine"], + "SampleRate": str(self._settings["sample_rate"]), } # Filter out None values @@ -160,7 +137,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: chunk = audio_data[i : i + chunk_size] if len(chunk) > 0: await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(chunk, self._sample_rate, 1) + frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1) yield frame await self.push_frame(TTSStoppedFrame()) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index a1349cefe..19cd1889a 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,12 +4,13 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import io - from typing import AsyncGenerator, Optional +import aiohttp +from loguru import logger +from PIL import Image from pydantic import BaseModel from pipecat.frames.frames import ( @@ -28,10 +29,6 @@ from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 -from PIL import Image - -from loguru import logger - # See .env.example for Azure configuration needed try: from azure.cognitiveservices.speech import ( @@ -99,114 +96,66 @@ def __init__( speech_config = SpeechConfig(subscription=api_key, region=region) self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None) - self._voice = voice - self._sample_rate = sample_rate - self._params = params + self._settings = { + "voice": voice, + "sample_rate": sample_rate, + "emphasis": params.emphasis, + "language": params.language, + "pitch": params.pitch, + "rate": params.rate, + "role": params.role, + "style": params.style, + "style_degree": params.style_degree, + "volume": params.volume, + } def can_generate_metrics(self) -> bool: return True def _construct_ssml(self, text: str) -> str: ssml = ( - f"" - f"" + f"" "" ) - if self._params.style: - ssml += f"" - if self._params.emphasis: - ssml += f"" + if self._settings["emphasis"]: + ssml += f"" ssml += text - if self._params.emphasis: + if self._settings["emphasis"]: ssml += "" ssml += "" - if self._params.style: + if self._settings["style"]: ssml += "" ssml += "" return ssml - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice = voice - - async def set_emphasis(self, emphasis: str): - logger.debug(f"Setting TTS emphasis to: [{emphasis}]") - self._params.emphasis = emphasis - - async def set_language(self, language: str): - logger.debug(f"Setting TTS language code to: [{language}]") - self._params.language = language - - async def set_pitch(self, pitch: str): - logger.debug(f"Setting TTS pitch to: [{pitch}]") - self._params.pitch = pitch - - async def set_rate(self, rate: str): - logger.debug(f"Setting TTS rate to: [{rate}]") - self._params.rate = rate - - async def set_role(self, role: str): - logger.debug(f"Setting TTS role to: [{role}]") - self._params.role = role - - async def set_style(self, style: str): - logger.debug(f"Setting TTS style to: [{style}]") - self._params.style = style - - async def set_style_degree(self, style_degree: str): - logger.debug(f"Setting TTS style degree to: 
[{style_degree}]") - self._params.style_degree = style_degree - - async def set_volume(self, volume: str): - logger.debug(f"Setting TTS volume to: [{volume}]") - self._params.volume = volume - - async def set_params(self, **kwargs): - valid_params = { - "voice": self.set_voice, - "emphasis": self.set_emphasis, - "language_code": self.set_language, - "pitch": self.set_pitch, - "rate": self.set_rate, - "role": self.set_role, - "style": self.set_style, - "style_degree": self.set_style_degree, - "volume": self.set_volume, - } - - for param, value in kwargs.items(): - if param in valid_params: - await valid_params[param](value) - else: - logger.warning(f"Ignoring unknown parameter: {param}") - - logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}") - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") @@ -222,7 +171,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.push_frame(TTSStartedFrame()) # Azure always sends a 44-byte header. Strip it off. yield TTSAudioRawFrame( - audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1 + audio=result.audio_data[44:], + sample_rate=self._settings["sample_rate"], + num_channels=1, ) await self.push_frame(TTSStoppedFrame()) elif result.reason == ResultReason.Canceled: diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 5f798b1e5..1bbe22953 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -4,36 +4,35 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import asyncio +import base64 import json import uuid -import base64 -import asyncio +from typing import AsyncGenerator, List, Optional, Union -from typing import AsyncGenerator, Optional, Union, List +from loguru import logger from pydantic.main import BaseModel from pipecat.frames.frames import ( CancelFrame, + EndFrame, ErrorFrame, Frame, - StartInterruptionFrame, + LLMFullResponseEndFrame, StartFrame, - EndFrame, + StartInterruptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - LLMFullResponseEndFrame, ) from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import TTSService, WordTTSService from pipecat.transcriptions.language import Language -from pipecat.services.ai_services import WordTTSService, TTSService - -from loguru import logger # See .env.example for Cartesia configuration needed try: - from cartesia import AsyncCartesia import websockets + from cartesia import AsyncCartesia except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -101,17 +100,19 @@ def __init__( self._api_key = api_key self._cartesia_version = cartesia_version self._url = url - self._voice_id = voice_id - self._model_id = model_id - self.set_model_name(model_id) - self._output_format = { - "container": params.container, - "encoding": params.encoding, - "sample_rate": params.sample_rate, + self._settings = { + "voice_id": voice_id, + "model_id": model_id, + "output_format": { + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, + }, + "language": params.language, + "speed": params.speed, + "emotion": params.emotion, } - self._language = params.language - self._speed = params.speed - self._emotion = params.emotion + self.set_model_name(model_id) self._websocket = None self._context_id = None @@ -121,37 +122,21 @@ def can_generate_metrics(self) -> bool: return True async def set_model(self, model: str): - 
-        self._model_id = model
+        self._settings["model_id"] = model
         await super().set_model(model)
         logger.debug(f"Switching TTS model to: [{model}]")
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
-    async def set_speed(self, speed: str):
-        logger.debug(f"Switching TTS speed to: [{speed}]")
-        self._speed = speed
-
-    async def set_emotion(self, emotion: list[str]):
-        logger.debug(f"Switching TTS emotion to: [{emotion}]")
-        self._emotion = emotion
-
-    async def set_language(self, language: Language):
-        logger.debug(f"Switching TTS language to: [{language}]")
-        self._language = language_to_cartesia_language(language)
-
     def _build_msg(
         self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
     ):
-        voice_config = {"mode": "id", "id": self._voice_id}
+        voice_config = {"mode": "id", "id": self._settings["voice_id"]}
 
-        if self._speed or self._emotion:
+        if self._settings["speed"] or self._settings["emotion"]:
             voice_config["__experimental_controls"] = {}
-            if self._speed:
-                voice_config["__experimental_controls"]["speed"] = self._speed
-            if self._emotion:
-                voice_config["__experimental_controls"]["emotion"] = self._emotion
+            if self._settings["speed"]:
+                voice_config["__experimental_controls"]["speed"] = self._settings["speed"]
+            if self._settings["emotion"]:
+                voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"]
 
         msg = {
             "transcript": text,
             "continue": continue_transcript,
             "context_id": self._context_id,
             "model_id": self._model_name,
             "voice": voice_config,
-            "output_format": self._output_format,
-            "language": self._language,
+            "output_format": self._settings["output_format"],
+            "language": self._settings["language"],
             "add_timestamps": add_timestamps,
         }
         return json.dumps(msg)
@@ -245,7 +230,7 @@ async def _receive_task_handler(self):
                     self.start_word_timestamps()
                     frame = TTSAudioRawFrame(
                         audio=base64.b64decode(msg["data"]),
-                        sample_rate=self._output_format["sample_rate"],
+                        sample_rate=self._settings["output_format"]["sample_rate"],
                         num_channels=1,
                     )
                     await self.push_frame(frame)
@@ -311,17 +296,19 @@ def __init__(
         super().__init__(**kwargs)
 
         self._api_key = api_key
-        self._voice_id = voice_id
-        self._model_id = model_id
-        self.set_model_name(model_id)
-        self._output_format = {
-            "container": params.container,
-            "encoding": params.encoding,
-            "sample_rate": params.sample_rate,
+        self._settings = {
+            "voice_id": voice_id,
+            "model_id": model_id,
+            "output_format": {
+                "container": params.container,
+                "encoding": params.encoding,
+                "sample_rate": params.sample_rate,
+            },
+            "language": params.language,
+            "speed": params.speed,
+            "emotion": params.emotion,
         }
-        self._language = params.language
-        self._speed = params.speed
-        self._emotion = params.emotion
+        self.set_model_name(model_id)
 
         self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
 
@@ -333,22 +320,6 @@ async def set_model(self, model: str):
         self._model_id = model
         await super().set_model(model)
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
-    async def set_speed(self, speed: str):
-        logger.debug(f"Switching TTS speed to: [{speed}]")
-        self._speed = speed
-
-    async def set_emotion(self, emotion: list[str]):
-        logger.debug(f"Switching TTS emotion to: [{emotion}]")
-        self._emotion = emotion
-
-    async def set_language(self, language: Language):
-        logger.debug(f"Switching TTS language to: [{language}]")
-        self._language = language_to_cartesia_language(language)
-
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
         await self._client.close()
@@ -365,19 +336,19 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         try:
             voice_controls = None
-            if self._speed or self._emotion:
+            if self._settings["speed"] or self._settings["emotion"]:
                 voice_controls = {}
-                if self._speed:
-                    voice_controls["speed"] = self._speed
-                if self._emotion:
-                    voice_controls["emotion"] = self._emotion
+                if self._settings["speed"]:
+                    voice_controls["speed"] = self._settings["speed"]
+                if self._settings["emotion"]:
+                    voice_controls["emotion"] = self._settings["emotion"]
 
             output = await self._client.tts.sse(
-                model_id=self._model_id,
+                model_id=self._settings["model_id"],
                 transcript=text,
-                voice_id=self._voice_id,
-                output_format=self._output_format,
-                language=self._language,
+                voice_id=self._settings["voice_id"],
+                output_format=self._settings["output_format"],
+                language=self._settings["language"],
                 stream=False,
                 _experimental_voice_controls=voice_controls,
             )
@@ -386,7 +357,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
             frame = TTSAudioRawFrame(
                 audio=output["audio"],
-                sample_rate=self._output_format["sample_rate"],
+                sample_rate=self._settings["output_format"]["sample_rate"],
                 num_channels=1,
             )
             yield frame
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index d109cce3c..28f472e55 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -5,9 +5,10 @@
 #
 
 import asyncio
-
 from typing import AsyncGenerator
 
+from loguru import logger
+
 from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
@@ -24,8 +25,6 @@
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 
-from loguru import logger
-
 # See .env.example for Deepgram configuration needed
 try:
     from deepgram import (
@@ -57,25 +56,23 @@ def __init__(
     ):
         super().__init__(**kwargs)
 
-        self._voice = voice
-        self._sample_rate = sample_rate
-        self._encoding = encoding
+        self._settings = {
+            "voice": voice,
+            "sample_rate": sample_rate,
+            "encoding": encoding,
+        }
 
         self._deepgram_client = DeepgramClient(api_key=api_key)
 
     def can_generate_metrics(self) -> bool:
         return True
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice = voice
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
 
         options = SpeakOptions(
-            model=self._voice,
-            encoding=self._encoding,
-            sample_rate=self._sample_rate,
+            model=self._settings["voice"],
+            encoding=self._settings["encoding"],
+            sample_rate=self._settings["sample_rate"],
             container="none",
         )
 
@@ -103,7 +100,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 chunk = audio_buffer.read(chunk_size)
                 if not chunk:
                     break
-                frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1)
+                frame = TTSAudioRawFrame(
+                    audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1
+                )
                 yield frame
 
             await self.push_frame(TTSStoppedFrame())
@@ -135,7 +134,9 @@ def __init__(
     ):
         super().__init__(**kwargs)
 
-        self._live_options = live_options
+        self._settings = {
+            "live_options": live_options,
+        }
 
         self._client = DeepgramClient(
             api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})
@@ -147,7 +148,7 @@ def __init__(
 
     @property
     def vad_enabled(self):
-        return self._live_options.vad_events
+        return self._settings["live_options"].vad_events
 
    def can_generate_metrics(self) -> bool:
         return self.vad_enabled
 
@@ -155,13 +156,7 @@ def can_generate_metrics(self) -> bool:
     async def set_model(self, model: str):
         await super().set_model(model)
         logger.debug(f"Switching STT model to: [{model}]")
-        self._live_options.model = model
-        await self._disconnect()
-        await self._connect()
-
-    async def set_language(self, language: Language):
-        logger.debug(f"Switching STT language to: [{language}]")
-        self._live_options.language = language
+        self._settings["live_options"].model = model
         await self._disconnect()
         await self._connect()
 
@@ -182,7 +177,7 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         yield None
 
     async def _connect(self):
-        if await self._connection.start(self._live_options):
+        if await self._connection.start(self._settings["live_options"]):
             logger.debug(f"{self}: Connected to Deepgram")
         else:
             logger.error(f"{self}: Unable to connect to Deepgram")
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
index 611f2a024..6b05a1f22 100644
--- a/src/pipecat/services/elevenlabs.py
+++ b/src/pipecat/services/elevenlabs.py
@@ -124,10 +124,20 @@ def __init__(
         )
 
         self._api_key = api_key
-        self._voice_id = voice_id
-        self.set_model_name(model)
         self._url = url
-        self._params = params
+        self._settings = {
+            "voice_id": voice_id,
+            "sample_rate": sample_rate_from_output_format(params.output_format),
+            "model": model,
+            "language": params.language,
+            "output_format": params.output_format,
+            "optimize_streaming_latency": params.optimize_streaming_latency,
+            "stability": params.stability,
+            "similarity_boost": params.similarity_boost,
+            "style": params.style,
+            "use_speaker_boost": params.use_speaker_boost,
+        }
+        self.set_model_name(model)
         self._voice_settings = self._set_voice_settings()
 
         # Websocket connection to ElevenLabs.
@@ -142,19 +152,22 @@ def can_generate_metrics(self) -> bool:
     def _set_voice_settings(self):
         voice_settings = {}
-        if self._params.stability is not None and self._params.similarity_boost is not None:
-            voice_settings["stability"] = self._params.stability
-            voice_settings["similarity_boost"] = self._params.similarity_boost
-            if self._params.style is not None:
-                voice_settings["style"] = self._params.style
-            if self._params.use_speaker_boost is not None:
-                voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
+        if (
+            self._settings["stability"] is not None
+            and self._settings["similarity_boost"] is not None
+        ):
+            voice_settings["stability"] = self._settings["stability"]
+            voice_settings["similarity_boost"] = self._settings["similarity_boost"]
+            if self._settings["style"] is not None:
+                voice_settings["style"] = self._settings["style"]
+            if self._settings["use_speaker_boost"] is not None:
+                voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
         else:
-            if self._params.style is not None:
+            if self._settings["style"] is not None:
                 logger.warning(
                     "'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
                 )
-            if self._params.use_speaker_boost is not None:
+            if self._settings["use_speaker_boost"] is not None:
                 logger.warning(
                     "'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
                 )
@@ -167,34 +180,6 @@ async def set_model(self, model: str):
         await self._disconnect()
         await self._connect()
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-        await self._disconnect()
-        await self._connect()
-
-    async def set_voice_settings(
-        self,
-        stability: Optional[float] = None,
-        similarity_boost: Optional[float] = None,
-        style: Optional[float] = None,
-        use_speaker_boost: Optional[bool] = None,
-    ):
-        self._params.stability = stability if stability is not None else self._params.stability
-        self._params.similarity_boost = (
-            similarity_boost if similarity_boost is not None else self._params.similarity_boost
-        )
-        self._params.style = style if style is not None else self._params.style
-        self._params.use_speaker_boost = (
-            use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
-        )
-
-        self._set_voice_settings()
-
-        if self._websocket:
-            msg = {"voice_settings": self._voice_settings}
-            await self._websocket.send(json.dumps(msg))
-
     async def start(self, frame: StartFrame):
         await super().start(frame)
         await self._connect()
@@ -221,21 +206,21 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
 
     async def _connect(self):
         try:
-            voice_id = self._voice_id
-            model = self.model_name
-            output_format = self._params.output_format
+            voice_id = self._settings["voice_id"]
+            model = self._settings["model"]
+            output_format = self._settings["output_format"]
             url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
 
-            if self._params.optimize_streaming_latency:
-                url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
+            if self._settings["optimize_streaming_latency"]:
+                url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
 
             # language can only be used with the 'eleven_turbo_v2_5' model
-            if self._params.language:
+            if self._settings["language"]:
                 if model == "eleven_turbo_v2_5":
-                    url += f"&language_code={self._params.language}"
+                    url += f"&language_code={self._settings['language']}"
                 else:
                     logger.debug(
-                        f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
+                        f"Language code [{self._settings['language']}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
                     )
 
             self._websocket = await websockets.connect(url)
@@ -286,7 +271,7 @@ async def _receive_task_handler(self):
                         self.start_word_timestamps()
 
                     audio = base64.b64decode(msg["audio"])
-                    frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
+                    frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
                     await self.push_frame(frame)
 
                     if msg.get("alignment"):
diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py
index a590d73cf..7537b780c 100644
--- a/src/pipecat/services/gladia.py
+++ b/src/pipecat/services/gladia.py
@@ -6,8 +6,9 @@
 
 import base64
 import json
-
 from typing import AsyncGenerator, Optional
+
+from loguru import logger
 from pydantic.main import BaseModel
 
 from pipecat.frames.frames import (
@@ -21,8 +22,6 @@
 from pipecat.services.ai_services import STTService
 from pipecat.utils.time import time_now_iso8601
 
-from loguru import logger
-
 # See .env.example for Gladia configuration needed
 try:
     import websockets
@@ -55,8 +54,14 @@ def __init__(
 
         self._api_key = api_key
         self._url = url
-        self._params = params
-        self._confidence = confidence
+        self._settings = {
+            "confidence": confidence,
+            "sample_rate": params.sample_rate,
+            "language": params.language,
+            "transcription_hint": params.transcription_hint,
+            "endpointing": params.endpointing,
+            "prosody": params.prosody,
+        }
 
     async def start(self, frame: StartFrame):
         await super().start(frame)
@@ -84,9 +89,15 @@ async def _setup_gladia(self):
             "encoding": "WAV/PCM",
             "model_type": "fast",
             "language_behaviour": "manual",
-            **self._params.model_dump(exclude_none=True),
+            "sample_rate": self._settings["sample_rate"],
+            "language": self._settings["language"],
+            "transcription_hint": self._settings["transcription_hint"],
+            "endpointing": self._settings["endpointing"],
+            "prosody": self._settings["prosody"],
         }
 
+        configuration = {k: v for k, v in configuration.items() if v is not None}
+
         await self._websocket.send(json.dumps(configuration))
 
     async def _send_audio(self, audio: bytes):
@@ -106,7 +117,7 @@ async def _receive_task_handler(self):
                 type = utterance["type"]
                 confidence = utterance["confidence"]
                 transcript = utterance["transcription"]
-                if confidence >= self._confidence:
+                if confidence >= self._settings["confidence"]:
                     if type == "final":
                         await self.push_frame(
                             TranscriptionFrame(transcript, "", time_now_iso8601())
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index 092b4703e..a759e4920 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -17,7 +17,7 @@
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesFrame,
-    ServiceUpdateSettingsFrame,
+    LLMUpdateSettingsFrame,
     TextFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
@@ -55,6 +55,9 @@ class GoogleLLMService(LLMService):
     def __init__(self, *, api_key: str, model: str = "gemini-1.5-flash-latest", **kwargs):
         super().__init__(**kwargs)
         gai.configure(api_key=api_key)
+        self._settings = {
+            "model": model,
+        }
         self._create_client(model)
 
     def can_generate_metrics(self) -> bool:
@@ -64,18 +67,13 @@ def _create_client(self, model: str):
         self.set_model_name(model)
         self._client = gai.GenerativeModel(model)
 
-    async def set_model(self, model: str):
-        logger.debug(f"Switching LLM model to: [{model}]")
-        self._create_client(model)
-
     async def _update_settings(self, settings: Dict[str, Any]):
         for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
+            if key in self._settings:
+                logger.debug(f"Updating LLM setting {key} to: [{value}]")
+                self._settings[key] = value
+                if key == "model":
+                    self._create_client(value)
             else:
                 logger.warning(f"Unknown setting for Google LLM service: {key}")
 
@@ -151,7 +149,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             context = OpenAILLMContext.from_messages(frame.messages)
         elif isinstance(frame, VisionImageRawFrame):
             context = OpenAILLMContext.from_image_frame(frame)
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+        elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
             await self.push_frame(frame, direction)
@@ -182,8 +180,17 @@ def __init__(
     ):
         super().__init__(sample_rate=sample_rate, **kwargs)
 
-        self._voice_id: str = voice_id
-        self._params = params
+        self._settings = {
+            "voice_id": voice_id,
+            "sample_rate": sample_rate,
+            "pitch": params.pitch,
+            "rate": params.rate,
+            "volume": params.volume,
+            "emphasis": params.emphasis,
+            "language": params.language,
+            "gender": params.gender,
+            "google_style": params.google_style,
+        }
         self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
             credentials, credentials_path
         )
@@ -215,39 +222,39 @@ def _construct_ssml(self, text: str) -> str:
         ssml = "<speak>"
 
         # Voice tag
-        voice_attrs = [f"name='{self._voice_id}'"]
-        if self._params.language:
-            voice_attrs.append(f"language='{self._params.language}'")
-        if self._params.gender:
-            voice_attrs.append(f"gender='{self._params.gender}'")
+        voice_attrs = [f"name='{self._settings['voice_id']}'"]
+        if self._settings["language"]:
+            voice_attrs.append(f"language='{self._settings['language']}'")
+        if self._settings["gender"]:
+            voice_attrs.append(f"gender='{self._settings['gender']}'")
         ssml += f"<voice {' '.join(voice_attrs)}>"
 
         # Prosody tag
         prosody_attrs = []
-        if self._params.pitch:
-            prosody_attrs.append(f"pitch='{self._params.pitch}'")
-        if self._params.rate:
-            prosody_attrs.append(f"rate='{self._params.rate}'")
-        if self._params.volume:
-            prosody_attrs.append(f"volume='{self._params.volume}'")
+        if self._settings["pitch"]:
+            prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+        if self._settings["rate"]:
+            prosody_attrs.append(f"rate='{self._settings['rate']}'")
+        if self._settings["volume"]:
+            prosody_attrs.append(f"volume='{self._settings['volume']}'")
         if prosody_attrs:
             ssml += f"<prosody {' '.join(prosody_attrs)}>"
 
         # Emphasis tag
-        if self._params.emphasis:
-            ssml += f"<emphasis level='{self._params.emphasis}'>"
+        if self._settings["emphasis"]:
+            ssml += f"<emphasis level='{self._settings['emphasis']}'>"
 
         # Google style tag
-        if self._params.google_style:
-            ssml += f"<google:style name='{self._params.google_style}'>"
+        if self._settings["google_style"]:
+            ssml += f"<google:style name='{self._settings['google_style']}'>"
 
         ssml += text
 
         # Close tags
-        if self._params.google_style:
+        if self._settings["google_style"]:
             ssml += "</google:style>"
-        if self._params.emphasis:
+        if self._settings["emphasis"]:
             ssml += "</emphasis>"
         if prosody_attrs:
             ssml += "</prosody>"
         ssml += "</voice></speak>"
 
         return ssml
 
-    async def set_voice(self, voice: str) -> None:
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
-    async def set_language(self, language: str) -> None:
-        logger.debug(f"Switching TTS language to: [{language}]")
-        self._params.language = language
-
-    async def set_pitch(self, pitch: str) -> None:
-        logger.debug(f"Switching TTS pitch to: [{pitch}]")
-        self._params.pitch = pitch
-
-    async def set_rate(self, rate: str) -> None:
-        logger.debug(f"Switching TTS rate to: [{rate}]")
-        self._params.rate = rate
-
-    async def set_volume(self, volume: str) -> None:
-        logger.debug(f"Switching TTS volume to: [{volume}]")
-        self._params.volume = volume
-
-    async def set_emphasis(
-        self, emphasis: Literal["strong", "moderate", "reduced", "none"]
-    ) -> None:
-        logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
-        self._params.emphasis = emphasis
-
-    async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None:
-        logger.debug(f"Switch TTS gender to [{gender}]")
-        self._params.gender = gender
-
-    async def set_google_style(
-        self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"]
-    ) -> None:
-        logger.debug(f"Switching TTS google style to: [{google_style}]")
-        self._params.google_style = google_style
-
-    async def set_params(self, params: InputParams) -> None:
-        logger.debug(f"Switching TTS params to: [{params}]")
-        self._params = params
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
 
@@ -304,11 +271,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         ssml = self._construct_ssml(text)
         synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml)
         voice = texttospeech_v1.VoiceSelectionParams(
-            language_code=self._params.language, name=self._voice_id
+            language_code=self._settings["language"], name=self._settings["voice_id"]
         )
         audio_config = texttospeech_v1.AudioConfig(
             audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
-            sample_rate_hertz=self.sample_rate,
+            sample_rate_hertz=self._settings["sample_rate"],
         )
 
         request = texttospeech_v1.SynthesizeSpeechRequest(
@@ -331,7 +298,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 if not chunk:
                     break
                 await self.stop_ttfb_metrics()
-                frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+                frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
                 yield frame
                 await asyncio.sleep(0)  # Allow other tasks to run
diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py
index 8f18002c5..2427aeb07 100644
--- a/src/pipecat/services/lmnt.py
+++ b/src/pipecat/services/lmnt.py
@@ -5,10 +5,10 @@
 #
 
 import asyncio
-
 from typing import AsyncGenerator
 
-from pipecat.processors.frame_processor import FrameDirection
+from loguru import logger
+
 from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
@@ -20,10 +20,9 @@
     TTSStartedFrame,
     TTSStoppedFrame,
 )
+from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import TTSService
 
-from loguru import logger
-
 # See .env.example for LMNT configuration needed
 try:
     from lmnt.api import Speech
@@ -50,13 +49,15 @@ def __init__(
         super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs)
 
         self._api_key = api_key
-        self._voice_id = voice_id
-        self._output_format = {
-            "container": "raw",
-            "encoding": "pcm_s16le",
-            "sample_rate": sample_rate,
+        self._settings = {
+            "voice_id": voice_id,
+            "output_format": {
+                "container": "raw",
+                "encoding": "pcm_s16le",
+                "sample_rate": sample_rate,
+            },
+            "language": language,
         }
-        self._language = language
 
         self._speech = None
         self._connection = None
@@ -68,10 +69,6 @@ def __init__(
     def can_generate_metrics(self) -> bool:
         return True
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
     async def start(self, frame: StartFrame):
         await super().start(frame)
         await self._connect()
@@ -93,7 +90,9 @@ async def _connect(self):
         try:
             self._speech = Speech()
             self._connection = await self._speech.synthesize_streaming(
-                self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]
+                self._settings["voice_id"],
+                format="raw",
+                sample_rate=self._settings["output_format"]["sample_rate"],
             )
             self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
         except Exception as e:
@@ -130,7 +129,7 @@ async def _receive_task_handler(self):
                 await self.stop_ttfb_metrics()
                 frame = TTSAudioRawFrame(
                     audio=msg["audio"],
-                    sample_rate=self._output_format["sample_rate"],
+                    sample_rate=self._settings["output_format"]["sample_rate"],
                     num_channels=1,
                 )
                 await self.push_frame(frame)
diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py
index 57c1b3399..8b479294a 100644
--- a/src/pipecat/services/openai.py
+++ b/src/pipecat/services/openai.py
@@ -24,7 +24,7 @@
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesFrame,
-    ServiceUpdateSettingsFrame,
+    LLMUpdateSettingsFrame,
     StartInterruptionFrame,
     TextFrame,
     TTSAudioRawFrame,
@@ -111,14 +111,17 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
+        self._settings = {
+            "model": model,
+            "frequency_penalty": params.frequency_penalty,
+            "presence_penalty": params.presence_penalty,
+            "seed": params.seed,
+            "temperature": params.temperature,
+            "top_p": params.top_p,
+            "extra": params.extra if isinstance(params.extra, dict) else {},
+        }
         self.set_model_name(model)
         self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
-        self._frequency_penalty = params.frequency_penalty
-        self._presence_penalty = params.presence_penalty
-        self._seed = params.seed
-        self._temperature = params.temperature
-        self._top_p = params.top_p
-        self._extra = params.extra if isinstance(params.extra, dict) else {}
 
     def create_client(self, api_key=None, base_url=None, **kwargs):
         return AsyncOpenAI(
@@ -134,30 +137,6 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
     def can_generate_metrics(self) -> bool:
         return True
 
-    async def set_frequency_penalty(self, frequency_penalty: float):
-        logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
-        self._frequency_penalty = frequency_penalty
-
-    async def set_presence_penalty(self, presence_penalty: float):
-        logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
-        self._presence_penalty = presence_penalty
-
-    async def set_seed(self, seed: int):
-        logger.debug(f"Switching LLM seed to: [{seed}]")
-        self._seed = seed
-
-    async def set_temperature(self, temperature: float):
-        logger.debug(f"Switching LLM temperature to: [{temperature}]")
-        self._temperature = temperature
-
-    async def set_top_p(self, top_p: float):
-        logger.debug(f"Switching LLM top_p to: [{top_p}]")
-        self._top_p = top_p
-
-    async def set_extra(self, extra: Dict[str, Any]):
-        logger.debug(f"Switching LLM extra to: [{extra}]")
-        self._extra = extra
-
     async def get_chat_completions(
         self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
     ) -> AsyncStream[ChatCompletionChunk]:
@@ -168,14 +147,14 @@ async def get_chat_completions(
             "tools": context.tools,
             "tool_choice": context.tool_choice,
             "stream_options": {"include_usage": True},
-            "frequency_penalty": self._frequency_penalty,
-            "presence_penalty": self._presence_penalty,
-            "seed": self._seed,
-            "temperature": self._temperature,
-            "top_p": self._top_p,
+            "frequency_penalty": self._settings["frequency_penalty"],
+            "presence_penalty": self._settings["presence_penalty"],
+            "seed": self._settings["seed"],
+            "temperature": self._settings["temperature"],
+            "top_p": self._settings["top_p"],
         }
 
-        params.update(self._extra)
+        params.update(self._settings["extra"])
 
         chunks = await self._client.chat.completions.create(**params)
         return chunks
 
@@ -297,12 +276,11 @@ async def _process_context(self, context: OpenAILLMContext):
 
     async def _update_settings(self, settings: Dict[str, Any]):
         for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
+            if key in self._settings:
+                logger.debug(f"Updating LLM setting {key} to: [{value}]")
+                self._settings[key] = value
+                if key == "model":
+                    self.set_model_name(value)
             else:
                 logger.warning(f"Unknown setting for OpenAI LLM service: {key}")
 
@@ -316,7 +294,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             context = OpenAILLMContext.from_messages(frame.messages)
         elif isinstance(frame, VisionImageRawFrame):
             context = OpenAILLMContext.from_image_frame(frame)
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+        elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
             await self.push_frame(frame, direction)
@@ -421,22 +399,21 @@ def __init__(
     ):
         super().__init__(sample_rate=sample_rate, **kwargs)
 
-        self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
+        self._settings = {
+            "voice": VALID_VOICES.get(voice, "alloy"),
+            "model": model,
+            "sample_rate": sample_rate,
+        }
         self.set_model_name(model)
-        self._sample_rate = sample_rate
 
         self._client = AsyncOpenAI(api_key=api_key)
 
     def can_generate_metrics(self) -> bool:
         return True
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice = VALID_VOICES.get(voice, self._voice)
-
     async def set_model(self, model: str):
         logger.debug(f"Switching TTS model to: [{model}]")
-        self._model = model
+        self._settings["model"] = model
 
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
@@ -445,8 +422,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
             async with self._client.audio.speech.with_streaming_response.create(
                 input=text,
-                model=self.model_name,
-                voice=self._voice,
+                model=self._settings["model"],
+                voice=self._settings["voice"],
                 response_format="pcm",
             ) as r:
                 if r.status_code != 200:
@@ -465,7 +442,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 async for chunk in r.iter_bytes(8192):
                     if len(chunk) > 0:
                         await self.stop_ttfb_metrics()
-                        frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+                        frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
                         yield frame
                 await self.push_frame(TTSStoppedFrame())
         except BadRequestError as e:
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
index 2ffa3a419..44ed9145e 100644
--- a/src/pipecat/services/playht.py
+++ b/src/pipecat/services/playht.py
@@ -6,17 +6,21 @@
 
 import io
 import struct
-
 from typing import AsyncGenerator
 
-from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame
-from pipecat.services.ai_services import TTSService
-
 from loguru import logger
 
+from pipecat.frames.frames import (
+    Frame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.ai_services import TTSService
+
 try:
-    from pyht.client import TTSOptions
     from pyht.async_client import AsyncClient
+    from pyht.client import TTSOptions
     from pyht.protos.api_pb2 import Format
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
@@ -39,17 +43,23 @@ def __init__(
             user_id=self._user_id,
             api_key=self._speech_key,
         )
+        self._settings = {
+            "voice": voice_url,
+            "sample_rate": sample_rate,
+            "quality": "higher",
+            "format": Format.FORMAT_WAV,
+            "voice_engine": "PlayHT2.0-turbo",
+        }
         self._options = TTSOptions(
-            voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV
+            voice=self._settings["voice"],
+            sample_rate=self._settings["sample_rate"],
+            quality=self._settings["quality"],
+            format=self._settings["format"],
         )
 
     def can_generate_metrics(self) -> bool:
         return True
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._options.voice = voice
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
 
@@ -60,7 +70,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             await self.start_ttfb_metrics()
 
             playht_gen = self._client.tts(
-                text, voice_engine="PlayHT2.0-turbo", options=self._options
+                text, voice_engine=self._settings["voice_engine"], options=self._options
             )
 
             await self.start_tts_usage_metrics(text)
@@ -83,7 +93,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 else:
                     if len(chunk):
                         await self.stop_ttfb_metrics()
-                        frame = TTSAudioRawFrame(chunk, 16000, 1)
+                        frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
                         yield frame
             await self.push_frame(TTSStoppedFrame())
         except Exception as e:
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 26a1a99fd..5fe11a179 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -50,13 +50,17 @@ def __init__(
     ):
         super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs)
         self.set_model_name(model)
-        self._max_tokens = params.max_tokens
-        self._frequency_penalty = params.frequency_penalty
-        self._presence_penalty = params.presence_penalty
-        self._temperature = params.temperature
-        self._top_k = params.top_k
-        self._top_p = params.top_p
-        self._extra = params.extra if isinstance(params.extra, dict) else {}
+        self._settings = {
+            "model": model,
+            "max_tokens": params.max_tokens,
+            "frequency_penalty": params.frequency_penalty,
+            "presence_penalty": params.presence_penalty,
+            "temperature": params.temperature,
+            "top_k": params.top_k,
+            "top_p": params.top_p,
+            "extra": params.extra if isinstance(params.extra, dict) else {},
+            "seed": params.seed,
+        }
 
     def can_generate_metrics(self) -> bool:
         return True
@@ -73,41 +77,12 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
         ),
     )
 
-    async def set_frequency_penalty(self, frequency_penalty: float):
-        logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
-        self._frequency_penalty = frequency_penalty
-
-    async def set_max_tokens(self, max_tokens: int):
-        logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
-        self._max_tokens = max_tokens
-
-    async def set_presence_penalty(self, presence_penalty: float):
-        logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
-        self._presence_penalty = presence_penalty
-
-    async def set_temperature(self, temperature: float):
-        logger.debug(f"Switching LLM temperature to: [{temperature}]")
-        self._temperature = temperature
-
-    async def set_top_k(self, top_k: float):
-        logger.debug(f"Switching LLM top_k to: [{top_k}]")
-        self._top_k = top_k
-
-    async def set_top_p(self, top_p: float):
-        logger.debug(f"Switching LLM top_p to: [{top_p}]")
-        self._top_p = top_p
-
-    async def set_extra(self, extra: Dict[str, Any]):
-        logger.debug(f"Switching LLM extra to: [{extra}]")
-        self._extra = extra
-
     async def _update_settings(self, settings: Dict[str, Any]):
         for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
+            if key in self._settings:
+                logger.debug(f"Updating LLM setting {key} to: [{value}]")
+                self._settings[key] = value
+                if key == "model":
+                    self.set_model_name(value)
             else:
                 logger.warning(f"Unknown setting for Together LLM service: {key}")
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 2c47d59e8..c833735d0 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -4,10 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-import aiohttp
-
 from typing import Any, AsyncGenerator, Dict
 
+import aiohttp
+import numpy as np
+from loguru import logger
+
 from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
@@ -18,10 +20,6 @@
 )
 from pipecat.services.ai_services import TTSService
 
-import numpy as np
-
-from loguru import logger
-
 try:
     import resampy
 except ModuleNotFoundError as e:
@@ -50,9 +48,11 @@ def __init__(
     ):
         super().__init__(**kwargs)
 
-        self._voice_id = voice_id
-        self._language = language
-        self._base_url = base_url
+        self._settings = {
+            "voice_id": voice_id,
+            "language": language,
+            "base_url": base_url,
+        }
         self._studio_speakers: Dict[str, Any] | None = None
         self._aiohttp_session = aiohttp_session
 
@@ -61,7 +61,7 @@ def can_generate_metrics(self) -> bool:
 
     async def start(self, frame: StartFrame):
         await super().start(frame)
-        async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r:
+        async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r:
             if r.status != 200:
                 text = await r.text()
                 logger.error(
@@ -75,10 +75,6 @@ async def start(self, frame: StartFrame):
                 return
             self._studio_speakers = await r.json()
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")
 
@@ -86,13 +82,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             logger.error(f"{self} no studio speakers available")
             return
 
-        embeddings = self._studio_speakers[self._voice_id]
+        embeddings = self._studio_speakers[self._settings["voice_id"]]
 
-        url = self._base_url + "/tts_stream"
+        url = self._settings["base_url"] + "/tts_stream"
 
         payload = {
             "text": text.replace(".", "").replace("*", ""),
-            "language": self._language,
+            "language": self._settings["language"],
             "speaker_embedding": embeddings["speaker_embedding"],
             "gpt_cond_latent": embeddings["gpt_cond_latent"],
            "add_wav_header": False,