From a397b859fec9e94ec7248cc2469366564e09e9c4 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 30 Sep 2024 15:47:41 -0400 Subject: [PATCH 1/5] Add support for gender and google_style inputs to Google TTS --- src/pipecat/frames/frames.py | 2 ++ src/pipecat/services/ai_services.py | 8 ++++++++ src/pipecat/services/google.py | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index f7faa8ef0..584be32db 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -558,6 +558,8 @@ class TTSUpdateSettingsFrame(ControlFrame): style: Optional[str] = None style_degree: Optional[str] = None role: Optional[str] = None + gender: Optional[str] = None + google_style: Optional[str] = None @dataclass diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index a329239dc..344531de3 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -235,6 +235,14 @@ async def set_role(self, role: str): async def flush_audio(self): pass + @abstractmethod + async def set_gender(self, gender: str): + pass + + @abstractmethod + async def set_google_style(self, google_style: str): + pass + # Converts the text to audio. @abstractmethod async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 519f47028..046b465b4 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -272,7 +272,7 @@ async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None logger.debug(f"Switch TTS gender to [{gender}]") self._params.gender = gender - async def google_style( + async def set_google_style( self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"] ) -> None: logger.debug(f"Switching TTS google style to: [{google_style}]") From 88cca7bf68ba742935aeaee6e2e14af3068850a1 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 1 Oct 2024 10:35:59 -0400 Subject: [PATCH 2/5] Consolidate service UpdateSettingsFrame into a single ServiceUpdateSettingsFrame --- CHANGELOG.md | 4 +- src/pipecat/frames/frames.py | 46 ++-------------- src/pipecat/services/ai_services.py | 85 ++++++++++++++--------------- src/pipecat/services/anthropic.py | 30 +++++----- src/pipecat/services/google.py | 25 +++++++-- src/pipecat/services/openai.py | 32 +++++------ src/pipecat/services/together.py | 33 ++++------- 7 files changed, 105 insertions(+), 150 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b59ed56c8..922a21f21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -86,8 +86,8 @@ async def on_connected(processor): ### Changed -- Updated individual update settings frame classes into a single UpdateSettingsFrame - class for STT, LLM, and TTS. +- Updated individual update settings frame classes into a single + ServiceUpdateSettingsFrame class. - We now distinguish between input and output audio and image frames. 
We introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 584be32db..7cb93606d 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -5,7 +5,7 @@ # from dataclasses import dataclass, field -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple from pipecat.clocks.base_clock import BaseClock from pipecat.metrics.metrics import MetricsData @@ -527,47 +527,11 @@ def __str__(self): @dataclass -class LLMUpdateSettingsFrame(ControlFrame): - """A control frame containing a request to update LLM settings.""" +class ServiceUpdateSettingsFrame(ControlFrame): + """A control frame containing a request to update service settings.""" - model: Optional[str] = None - temperature: Optional[float] = None - top_k: Optional[int] = None - top_p: Optional[float] = None - frequency_penalty: Optional[float] = None - presence_penalty: Optional[float] = None - max_tokens: Optional[int] = None - seed: Optional[int] = None - extra: dict = field(default_factory=dict) - - -@dataclass -class TTSUpdateSettingsFrame(ControlFrame): - """A control frame containing a request to update TTS settings.""" - - model: Optional[str] = None - voice: Optional[str] = None - language: Optional[Language] = None - speed: Optional[Union[str, float]] = None - emotion: Optional[List[str]] = None - engine: Optional[str] = None - pitch: Optional[str] = None - rate: Optional[str] = None - volume: Optional[str] = None - emphasis: Optional[str] = None - style: Optional[str] = None - style_degree: Optional[str] = None - role: Optional[str] = None - gender: Optional[str] = None - google_style: Optional[str] = None - - -@dataclass -class STTUpdateSettingsFrame(ControlFrame): - """A control frame containing a request to update STT settings.""" - - model: Optional[str] = None - language: Optional[Language] = None + service_type: str + settings: Dict[str, Any] @dataclass diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 344531de3..6ddc4d426 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -8,7 +8,7 @@ import io import wave from abc import abstractmethod -from typing import AsyncGenerator, List, Optional, Tuple, Union +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union from loguru import logger @@ -19,15 +19,14 @@ ErrorFrame, Frame, LLMFullResponseEndFrame, + ServiceUpdateSettingsFrame, StartFrame, StartInterruptionFrame, - STTUpdateSettingsFrame, TextFrame, TTSAudioRawFrame, TTSSpeakFrame, TTSStartedFrame, TTSStoppedFrame, - TTSUpdateSettingsFrame, UserImageRequestFrame, VisionImageRawFrame, ) @@ -169,6 +168,7 @@ def __init__( self._push_stop_frames: bool = push_stop_frames self._stop_frame_timeout_s: float = stop_frame_timeout_s self._sample_rate: int = sample_rate + self._settings: Dict[str, Any] = {} self._stop_frame_task: Optional[asyncio.Task] = None self._stop_frame_queue: asyncio.Queue = asyncio.Queue() @@ -232,15 +232,15 @@ async def set_role(self, role: str): pass @abstractmethod - async def flush_audio(self): + async def set_gender(self, gender: str): pass @abstractmethod - async def set_gender(self, gender: str): + async def set_google_style(self, google_style: str): pass @abstractmethod - async def set_google_style(self, google_style: str): + async def flush_audio(self): pass # Converts the text to audio. 
@@ -267,6 +267,22 @@ async def cancel(self, frame: CancelFrame): await self._stop_frame_task self._stop_frame_task = None + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + setter = getattr(self, f"set_{key}", None) + if setter and callable(setter): + try: + if key == "language": + await setter(Language(value)) + else: + await setter(value) + except Exception as e: + logger.warning(f"Error setting {key}: {e}") + else: + logger.warning(f"Unknown setting for TTS service: {key}") + + self._settings.update(settings) + async def say(self, text: str): aggregate_sentences = self._aggregate_sentences self._aggregate_sentences = False @@ -293,8 +309,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, TTSSpeakFrame): await self._push_tts_frames(frame.text) await self.flush_audio() - elif isinstance(frame, TTSUpdateSettingsFrame): - await self._update_tts_settings(frame) + elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts": + await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) @@ -341,34 +357,6 @@ async def _push_tts_frames(self, text: str): # interrupted, the text is not added to the assistant context. await self.push_frame(TextFrame(text)) - async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame): - if frame.model is not None: - await self.set_model(frame.model) - if frame.voice is not None: - await self.set_voice(frame.voice) - if frame.language is not None: - await self.set_language(frame.language) - if frame.speed is not None: - await self.set_speed(frame.speed) - if frame.emotion is not None: - await self.set_emotion(frame.emotion) - if frame.engine is not None: - await self.set_engine(frame.engine) - if frame.pitch is not None: - await self.set_pitch(frame.pitch) - if frame.rate is not None: - await self.set_rate(frame.rate) - if frame.volume is not None: - await self.set_volume(frame.volume) - if frame.emphasis is not None: - await self.set_emphasis(frame.emphasis) - if frame.style is not None: - await self.set_style(frame.style) - if frame.style_degree is not None: - await self.set_style_degree(frame.style_degree) - if frame.role is not None: - await self.set_role(frame.role) - async def _stop_frame_handler(self): try: has_started = False @@ -454,6 +442,7 @@ class STTService(AIService): def __init__(self, **kwargs): super().__init__(**kwargs) + self._settings: Dict[str, Any] = {} @abstractmethod async def set_model(self, model: str): @@ -468,11 +457,21 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: """Returns transcript as a string""" pass - async def _update_stt_settings(self, frame: STTUpdateSettingsFrame): - if frame.model is not None: - await self.set_model(frame.model) - if frame.language is not None: - await self.set_language(frame.language) + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + setter = getattr(self, f"set_{key}", None) + if setter and callable(setter): + try: + if key == "language": + await setter(Language(value)) + else: + await setter(value) + except Exception as e: + logger.warning(f"Error setting {key}: {e}") + else: + logger.warning(f"Unknown setting for STT service: {key}") + + self._settings.update(settings) async def process_audio_frame(self, frame: AudioRawFrame): await self.process_generator(self.run_stt(frame.audio)) @@ -485,8 +484,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): 
# In this service we accumulate audio internally and at the end we # push a TextFrame. We don't really want to push audio frames down. await self.process_audio_frame(frame) - elif isinstance(frame, STTUpdateSettingsFrame): - await self._update_stt_settings(frame) + elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt": + await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 639a922e6..09d6c5402 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -25,7 +25,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMUpdateSettingsFrame, + ServiceUpdateSettingsFrame, StartInterruptionFrame, TextFrame, UserImageRawFrame, @@ -284,20 +284,16 @@ async def _process_context(self, context: OpenAILLMContext): cache_read_input_tokens=cache_read_input_tokens, ) - async def _update_settings(self, frame: LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.max_tokens is not None: - await self.set_max_tokens(frame.max_tokens) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_k is not None: - await self.set_top_k(frame.top_k) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + setter = getattr(self, f"set_{key}", None) + if setter and callable(setter): + try: + await setter(value) + except Exception as e: + logger.warning(f"Error setting {key}: {e}") + else: + logger.warning(f"Unknown setting for Anthropic LLM service: {key}") async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -313,8 +309,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # UserImageRawFrames coming through the pipeline and add them # to the context. 
context = AnthropicLLMContext.from_image_frame(frame) - elif isinstance(frame, LLMUpdateSettingsFrame): - await self._update_settings(frame) + elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm": + await self._update_settings(frame.settings) elif isinstance(frame, LLMEnablePromptCachingFrame): logger.debug(f"Setting enable prompt caching to: [{frame.enable}]") self._enable_prompt_caching_beta = frame.enable diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 046b465b4..092b4703e 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -6,7 +6,7 @@ import asyncio import json -from typing import AsyncGenerator, List, Literal, Optional +from typing import Any, AsyncGenerator, Dict, List, Literal, Optional from loguru import logger from pydantic import BaseModel @@ -17,7 +17,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMUpdateSettingsFrame, + ServiceUpdateSettingsFrame, TextFrame, TTSAudioRawFrame, TTSStartedFrame, @@ -64,6 +64,21 @@ def _create_client(self, model: str): self.set_model_name(model) self._client = gai.GenerativeModel(model) + async def set_model(self, model: str): + logger.debug(f"Switching LLM model to: [{model}]") + self._create_client(model) + + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + setter = getattr(self, f"set_{key}", None) + if setter and callable(setter): + try: + await setter(value) + except Exception as e: + logger.warning(f"Error setting {key}: {e}") + else: + logger.warning(f"Unknown setting for Google LLM service: {key}") + def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]: openai_messages = context.get_messages() google_messages = [] @@ -136,10 +151,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) + elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm": + await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 9a7cc9023..57c1b3399 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -24,7 +24,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - LLMUpdateSettingsFrame, + ServiceUpdateSettingsFrame, StartInterruptionFrame, TextFrame, TTSAudioRawFrame, @@ -295,22 +295,16 @@ async def _process_context(self, context: OpenAILLMContext): f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." 
) - async def _update_settings(self, frame: LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.frequency_penalty is not None: - await self.set_frequency_penalty(frame.frequency_penalty) - if frame.presence_penalty is not None: - await self.set_presence_penalty(frame.presence_penalty) - if frame.seed is not None: - await self.set_seed(frame.seed) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + setter = getattr(self, f"set_{key}", None) + if setter and callable(setter): + try: + await setter(value) + except Exception as e: + logger.warning(f"Error setting {key}: {e}") + else: + logger.warning(f"Unknown setting for OpenAI LLM service: {key}") async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -322,8 +316,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, LLMUpdateSettingsFrame): - await self._update_settings(frame) + elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm": + await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 5da470002..26a1a99fd 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -5,16 +5,13 @@ # from typing import Any, Dict, Optional + import httpx from loguru import logger from pydantic import BaseModel, Field -from pipecat.frames.frames import ( - LLMUpdateSettingsFrame, -) from pipecat.services.openai import OpenAILLMService - try: # Together.ai is recommending OpenAI-compatible function calling, so we've switched over # to using the OpenAI client library here rather than the Together Python client library. 
@@ -104,21 +101,13 @@ async def set_extra(self, extra: Dict[str, Any]): logger.debug(f"Switching LLM extra to: [{extra}]") self._extra = extra - async def _update_settings(self, frame: LLMUpdateSettingsFrame): - if frame.model is not None: - logger.debug(f"Switching LLM model to: [{frame.model}]") - self.set_model_name(frame.model) - if frame.frequency_penalty is not None: - await self.set_frequency_penalty(frame.frequency_penalty) - if frame.max_tokens is not None: - await self.set_max_tokens(frame.max_tokens) - if frame.presence_penalty is not None: - await self.set_presence_penalty(frame.presence_penalty) - if frame.temperature is not None: - await self.set_temperature(frame.temperature) - if frame.top_k is not None: - await self.set_top_k(frame.top_k) - if frame.top_p is not None: - await self.set_top_p(frame.top_p) - if frame.extra: - await self.set_extra(frame.extra) + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + setter = getattr(self, f"set_{key}", None) + if setter and callable(setter): + try: + await setter(value) + except Exception as e: + logger.warning(f"Error setting {key}: {e}") + else: + logger.warning(f"Unknown setting for Together LLM service: {key}") From 28643b453d78c0d71e040e52011f388e7eaad729 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 1 Oct 2024 14:37:07 -0400 Subject: [PATCH 3/5] Update to use LLM, STT, TTS subclasses and remove setter methods --- .../07c-interruptible-deepgram.py | 2 +- .../foundational/07e-interruptible-playht.py | 16 +- .../07g-interruptible-openai-tts.py | 17 +-- .../16-gpu-container-local-bot.py | 14 +- src/pipecat/frames/frames.py | 16 +- src/pipecat/services/ai_services.py | 120 +++++---------- src/pipecat/services/anthropic.py | 69 +++------ src/pipecat/services/aws.py | 70 +++------ src/pipecat/services/azure.py | 124 +++++----------- src/pipecat/services/cartesia.py | 137 +++++++----------- src/pipecat/services/deepgram.py | 43 +++--- src/pipecat/services/elevenlabs.py | 87 +++++------ src/pipecat/services/gladia.py | 19 ++- src/pipecat/services/google.py | 112 +++++--------- src/pipecat/services/lmnt.py | 32 ++-- src/pipecat/services/openai.py | 83 +++-------- src/pipecat/services/playht.py | 36 +++-- src/pipecat/services/together.py | 55 ++----- src/pipecat/services/xtts.py | 28 ++-- 19 files changed, 395 insertions(+), 685 deletions(-) diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index fc33c246f..f3b4ee246 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -50,7 +50,7 @@ async def main(): stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en") + tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice_id="aura-helios-en") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py index 9c48df93a..58d85ed79 100644 --- a/examples/foundational/07e-interruptible-playht.py +++ b/examples/foundational/07e-interruptible-playht.py @@ -4,11 +4,15 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import 
LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -17,17 +21,11 @@ LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) -from pipecat.services.playht import PlayHTTTSService from pipecat.services.openai import OpenAILLMService +from pipecat.services.playht import PlayHTTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -54,7 +52,7 @@ async def main(): tts = PlayHTTTSService( user_id=os.getenv("PLAYHT_USER_ID"), api_key=os.getenv("PLAYHT_API_KEY"), - voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json", + voice_id="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json", ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/examples/foundational/07g-interruptible-openai-tts.py b/examples/foundational/07g-interruptible-openai-tts.py index 70576c97a..b7671c42f 100644 --- a/examples/foundational/07g-interruptible-openai-tts.py +++ b/examples/foundational/07g-interruptible-openai-tts.py @@ -4,11 +4,15 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -17,17 +21,10 @@ LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) -from pipecat.services.openai import OpenAITTSService -from pipecat.services.openai import OpenAILLMService +from pipecat.services.openai import OpenAILLMService, OpenAITTSService from pipecat.transports.services.daily import DailyParams, DailyTransport from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -51,7 +48,7 @@ async def main(): ), ) - tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy") + tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice_id="alloy") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/examples/foundational/16-gpu-container-local-bot.py b/examples/foundational/16-gpu-container-local-bot.py index 06bf45195..b3ae9686e 100644 --- a/examples/foundational/16-gpu-container-local-bot.py +++ b/examples/foundational/16-gpu-container-local-bot.py @@ -5,10 +5,14 @@ # import asyncio -import aiohttp import os import sys +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -26,12 +30,6 @@ ) from pipecat.vad.silero import SileroVADAnalyzer -from runner import configure - -from loguru import logger - -from dotenv import load_dotenv - load_dotenv(override=True) logger.remove(0) @@ -57,7 +55,7 @@ async def main(): tts = DeepgramTTSService( aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), - voice="aura-asteria-en", + voice_id="aura-asteria-en", base_url="http://0.0.0.0:8080/v1/speak", ) diff --git 
a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 7cb93606d..e2ef78df5 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -530,10 +530,24 @@ def __str__(self): class ServiceUpdateSettingsFrame(ControlFrame): """A control frame containing a request to update service settings.""" - service_type: str settings: Dict[str, Any] +@dataclass +class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame): + pass + + +@dataclass +class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame): + pass + + +@dataclass +class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame): + pass + + @dataclass class FunctionCallInProgressFrame(SystemFrame): """A frame signaling that a function call is in progress.""" diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 6ddc4d426..64d79b582 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -8,7 +8,7 @@ import io import wave from abc import abstractmethod -from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple from loguru import logger @@ -19,14 +19,15 @@ ErrorFrame, Frame, LLMFullResponseEndFrame, - ServiceUpdateSettingsFrame, StartFrame, StartInterruptionFrame, + STTUpdateSettingsFrame, TextFrame, TTSAudioRawFrame, TTSSpeakFrame, TTSStartedFrame, TTSStoppedFrame, + TTSUpdateSettingsFrame, UserImageRequestFrame, VisionImageRawFrame, ) @@ -44,6 +45,7 @@ class AIService(FrameProcessor): def __init__(self, **kwargs): super().__init__(**kwargs) self._model_name: str = "" + self._settings: Dict[str, Any] = {} @property def model_name(self) -> str: @@ -62,6 +64,16 @@ async def stop(self, frame: EndFrame): async def cancel(self, frame: CancelFrame): pass + async def _update_settings(self, settings: Dict[str, Any]): + for key, value in settings.items(): + if key in self._settings: + logger.debug(f"Updating setting {key} to: [{value}] for {self.name}") + self._settings[key] = value + elif key == "model": + self.set_model_name(value) + else: + logger.warning(f"Unknown setting for {self.name} service: {key}") + async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -168,6 +180,7 @@ def __init__( self._push_stop_frames: bool = push_stop_frames self._stop_frame_timeout_s: float = stop_frame_timeout_s self._sample_rate: int = sample_rate + self._voice_id: str = "" self._settings: Dict[str, Any] = {} self._stop_frame_task: Optional[asyncio.Task] = None @@ -184,60 +197,8 @@ async def set_model(self, model: str): self.set_model_name(model) @abstractmethod - async def set_voice(self, voice: str): - pass - - @abstractmethod - async def set_language(self, language: Language): - pass - - @abstractmethod - async def set_speed(self, speed: Union[str, float]): - pass - - @abstractmethod - async def set_emotion(self, emotion: List[str]): - pass - - @abstractmethod - async def set_engine(self, engine: str): - pass - - @abstractmethod - async def set_pitch(self, pitch: str): - pass - - @abstractmethod - async def set_rate(self, rate: str): - pass - - @abstractmethod - async def set_volume(self, volume: str): - pass - - @abstractmethod - async def set_emphasis(self, emphasis: str): - pass - - @abstractmethod - async def set_style(self, style: str): - pass - - @abstractmethod - async def set_style_degree(self, style_degree: str): - pass - - @abstractmethod - async def set_role(self, role: str): - pass - - @abstractmethod - 
async def set_gender(self, gender: str): - pass - - @abstractmethod - async def set_google_style(self, google_style: str): - pass + def set_voice(self, voice: str): + self._voice_id = voice @abstractmethod async def flush_audio(self): @@ -269,20 +230,18 @@ async def cancel(self, frame: CancelFrame): async def _update_settings(self, settings: Dict[str, Any]): for key, value in settings.items(): - setter = getattr(self, f"set_{key}", None) - if setter and callable(setter): - try: - if key == "language": - await setter(Language(value)) - else: - await setter(value) - except Exception as e: - logger.warning(f"Error setting {key}: {e}") + if key in self._settings: + logger.debug(f"Updating TTS setting {key} to: [{value}]") + self._settings[key] = value + if key == "language": + self._settings[key] = Language(value) + elif key == "model": + self.set_model_name(value) + elif key == "voice": + self.set_voice(value) else: logger.warning(f"Unknown setting for TTS service: {key}") - self._settings.update(settings) - async def say(self, text: str): aggregate_sentences = self._aggregate_sentences self._aggregate_sentences = False @@ -309,7 +268,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): elif isinstance(frame, TTSSpeakFrame): await self._push_tts_frames(frame.text) await self.flush_audio() - elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts": + elif isinstance(frame, TTSUpdateSettingsFrame): await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) @@ -448,31 +407,24 @@ def __init__(self, **kwargs): async def set_model(self, model: str): self.set_model_name(model) - @abstractmethod - async def set_language(self, language: Language): - pass - @abstractmethod async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: """Returns transcript as a string""" pass async def _update_settings(self, settings: Dict[str, Any]): + logger.debug(f"Updating STT settings: {self._settings}") for key, value in settings.items(): - setter = getattr(self, f"set_{key}", None) - if setter and callable(setter): - try: - if key == "language": - await setter(Language(value)) - else: - await setter(value) - except Exception as e: - logger.warning(f"Error setting {key}: {e}") + if key in self._settings: + logger.debug(f"Updating STT setting {key} to: [{value}]") + self._settings[key] = value + if key == "language": + self._settings[key] = Language(value) + elif key == "model": + self.set_model_name(value) else: logger.warning(f"Unknown setting for STT service: {key}") - self._settings.update(settings) - async def process_audio_frame(self, frame: AudioRawFrame): await self.process_generator(self.run_stt(frame.audio)) @@ -484,7 +436,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # In this service we accumulate audio internally and at the end we # push a TextFrame. We don't really want to push audio frames down. 
await self.process_audio_frame(frame) - elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt": + elif isinstance(frame, STTUpdateSettingsFrame): await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py index 09d6c5402..1b7064209 100644 --- a/src/pipecat/services/anthropic.py +++ b/src/pipecat/services/anthropic.py @@ -25,7 +25,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - ServiceUpdateSettingsFrame, + LLMUpdateSettingsFrame, StartInterruptionFrame, TextFrame, UserImageRawFrame, @@ -96,12 +96,14 @@ def __init__( super().__init__(**kwargs) self._client = AsyncAnthropic(api_key=api_key) self.set_model_name(model) - self._max_tokens = params.max_tokens - self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False - self._temperature = params.temperature - self._top_k = params.top_k - self._top_p = params.top_p - self._extra = params.extra if isinstance(params.extra, dict) else {} + self._settings = { + "max_tokens": params.max_tokens, + "enable_prompt_caching_beta": params.enable_prompt_caching_beta or False, + "temperature": params.temperature, + "top_k": params.top_k, + "top_p": params.top_p, + "extra": params.extra if isinstance(params.extra, dict) else {}, + } def can_generate_metrics(self) -> bool: return True @@ -120,30 +122,6 @@ def create_context_aggregator( ) return AnthropicContextAggregatorPair(_user=user, _assistant=assistant) - async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool): - logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]") - self._enable_prompt_caching_beta = enable_prompt_caching_beta - - async def set_max_tokens(self, max_tokens: int): - logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]") - self._max_tokens = max_tokens - - async def set_temperature(self, temperature: float): - logger.debug(f"Switching LLM temperature to: [{temperature}]") - self._temperature = temperature - - async def set_top_k(self, top_k: float): - logger.debug(f"Switching LLM top_k to: [{top_k}]") - self._top_k = top_k - - async def set_top_p(self, top_p: float): - logger.debug(f"Switching LLM top_p to: [{top_p}]") - self._top_p = top_p - - async def set_extra(self, extra: Dict[str, Any]): - logger.debug(f"Switching LLM extra to: [{extra}]") - self._extra = extra - async def _process_context(self, context: OpenAILLMContext): # Usage tracking. We track the usage reported by Anthropic in prompt_tokens and # completion_tokens. 
We also estimate the completion tokens from output text @@ -165,11 +143,11 @@ async def _process_context(self, context: OpenAILLMContext): ) messages = context.messages - if self._enable_prompt_caching_beta: + if self._settings["enable_prompt_caching_beta"]: messages = context.get_messages_with_cache_control_markers() api_call = self._client.messages.create - if self._enable_prompt_caching_beta: + if self._settings["enable_prompt_caching_beta"]: api_call = self._client.beta.prompt_caching.messages.create await self.start_ttfb_metrics() @@ -179,14 +157,14 @@ async def _process_context(self, context: OpenAILLMContext): "system": context.system, "messages": messages, "model": self.model_name, - "max_tokens": self._max_tokens, + "max_tokens": self._settings["max_tokens"], "stream": True, - "temperature": self._temperature, - "top_k": self._top_k, - "top_p": self._top_p, + "temperature": self._settings["temperature"], + "top_k": self._settings["top_k"], + "top_p": self._settings["top_p"], } - params.update(self._extra) + params.update(self._settings["extra"]) response = await api_call(**params) @@ -284,17 +262,6 @@ async def _process_context(self, context: OpenAILLMContext): cache_read_input_tokens=cache_read_input_tokens, ) - async def _update_settings(self, settings: Dict[str, Any]): - for key, value in settings.items(): - setter = getattr(self, f"set_{key}", None) - if setter and callable(setter): - try: - await setter(value) - except Exception as e: - logger.warning(f"Error setting {key}: {e}") - else: - logger.warning(f"Unknown setting for Anthropic LLM service: {key}") - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -309,11 +276,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): # UserImageRawFrames coming through the pipeline and add them # to the context. 
                context = AnthropicLLMContext.from_image_frame(frame)
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+        elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         elif isinstance(frame, LLMEnablePromptCachingFrame):
             logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
-            self._enable_prompt_caching_beta = frame.enable
+            self._settings["enable_prompt_caching_beta"] = frame.enable
         else:
             await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index 80240985f..7004c21f7 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -6,6 +6,7 @@
 from typing import AsyncGenerator, Optional
 
+from loguru import logger
 from pydantic import BaseModel
 
 from pipecat.frames.frames import (
@@ -17,8 +18,6 @@
 )
 from pipecat.services.ai_services import TTSService
 
-from loguru import logger
-
 try:
     import boto3
     from botocore.exceptions import BotoCoreError, ClientError
@@ -57,9 +56,16 @@ def __init__(
             aws_secret_access_key=api_key,
             region_name=region,
         )
-        self._voice_id = voice_id
-        self._sample_rate = sample_rate
-        self._params = params
+        self._settings = {
+            "sample_rate": sample_rate,
+            "engine": params.engine,
+            "language": params.language,
+            "pitch": params.pitch,
+            "rate": params.rate,
+            "volume": params.volume,
+        }
+
+        self.set_voice(voice_id)
 
     def can_generate_metrics(self) -> bool:
         return True
@@ -67,18 +73,18 @@ def can_generate_metrics(self) -> bool:
 
     def _construct_ssml(self, text: str) -> str:
         ssml = ""
 
-        if self._params.language:
-            ssml += f""
+        if self._settings["language"]:
+            ssml += f""
 
         prosody_attrs = []
         # Prosody tags are only supported for standard and neural engines
-        if self._params.engine != "generative":
-            if self._params.rate:
-                prosody_attrs.append(f"rate='{self._params.rate}'")
-            if self._params.pitch:
-                prosody_attrs.append(f"pitch='{self._params.pitch}'")
-            if self._params.volume:
-                prosody_attrs.append(f"volume='{self._params.volume}'")
+        if self._settings["engine"] != "generative":
+            if self._settings["rate"]:
+                prosody_attrs.append(f"rate='{self._settings['rate']}'")
+            if self._settings["pitch"]:
+                prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+            if self._settings["volume"]:
+                prosody_attrs.append(f"volume='{self._settings['volume']}'")
 
             if prosody_attrs:
                 ssml += f""
@@ -90,41 +96,13 @@ def _construct_ssml(self, text: str) -> str:
         if prosody_attrs:
             ssml += ""
 
-        if self._params.language:
+        if self._settings["language"]:
             ssml += ""
 
         ssml += ""
 
         return ssml
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
-    async def set_engine(self, engine: str):
-        logger.debug(f"Switching TTS engine to: [{engine}]")
-        self._params.engine = engine
-
-    async def set_language(self, language: str):
-        logger.debug(f"Switching TTS language to: [{language}]")
-        self._params.language = language
-
-    async def set_pitch(self, pitch: str):
-        logger.debug(f"Switching TTS pitch to: [{pitch}]")
-        self._params.pitch = pitch
-
-    async def set_rate(self, rate: str):
-        logger.debug(f"Switching TTS rate to: [{rate}]")
-        self._params.rate = rate
-
-    async def set_volume(self, volume: str):
-        logger.debug(f"Switching TTS volume to: [{volume}]")
-        self._params.volume = volume
-
-    async def set_params(self, params: InputParams):
-        logger.debug(f"Switching TTS params to: [{params}]")
-        self._params = params
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]") @@ -139,8 +117,8 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: "TextType": "ssml", "OutputFormat": "pcm", "VoiceId": self._voice_id, - "Engine": self._params.engine, - "SampleRate": str(self._sample_rate), + "Engine": self._settings["engine"], + "SampleRate": str(self._settings["sample_rate"]), } # Filter out None values @@ -160,7 +138,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: chunk = audio_data[i : i + chunk_size] if len(chunk) > 0: await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(chunk, self._sample_rate, 1) + frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1) yield frame await self.push_frame(TTSStoppedFrame()) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index a1349cefe..1b2b9a3f2 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,12 +4,13 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import io - from typing import AsyncGenerator, Optional +import aiohttp +from loguru import logger +from PIL import Image from pydantic import BaseModel from pipecat.frames.frames import ( @@ -28,10 +29,6 @@ from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 -from PIL import Image - -from loguru import logger - # See .env.example for Azure configuration needed try: from azure.cognitiveservices.speech import ( @@ -89,7 +86,7 @@ def __init__( *, api_key: str, region: str, - voice="en-US-SaraNeural", + voice_id="en-US-SaraNeural", sample_rate: int = 16000, params: InputParams = InputParams(), **kwargs, @@ -99,114 +96,67 @@ def __init__( speech_config = SpeechConfig(subscription=api_key, region=region) self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None) - self._voice = voice - self._sample_rate = sample_rate - self._params = params + self._settings = { + "sample_rate": sample_rate, + "emphasis": params.emphasis, + "language": params.language, + "pitch": params.pitch, + "rate": params.rate, + "role": params.role, + "style": params.style, + "style_degree": params.style_degree, + "volume": params.volume, + } + + self.set_voice(voice_id) def can_generate_metrics(self) -> bool: return True def _construct_ssml(self, text: str) -> str: ssml = ( - f"" - f"" + f"" "" ) - if self._params.style: - ssml += f"" - if self._params.emphasis: - ssml += f"" + if self._settings["emphasis"]: + ssml += f"" ssml += text - if self._params.emphasis: + if self._settings["emphasis"]: ssml += "" ssml += "" - if self._params.style: + if self._settings["style"]: ssml += "" ssml += "" return ssml - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice = voice - - async def set_emphasis(self, emphasis: str): - logger.debug(f"Setting TTS emphasis to: [{emphasis}]") - self._params.emphasis = emphasis - - async def set_language(self, language: str): - logger.debug(f"Setting TTS language code to: [{language}]") - self._params.language = language - - async def set_pitch(self, pitch: str): - logger.debug(f"Setting TTS pitch to: [{pitch}]") - self._params.pitch = pitch - - async def set_rate(self, rate: str): - logger.debug(f"Setting TTS rate to: [{rate}]") - self._params.rate = rate - - async def set_role(self, role: str): - logger.debug(f"Setting TTS role to: [{role}]") - self._params.role = role - - async def set_style(self, style: str): - logger.debug(f"Setting TTS style to: 
[{style}]") - self._params.style = style - - async def set_style_degree(self, style_degree: str): - logger.debug(f"Setting TTS style degree to: [{style_degree}]") - self._params.style_degree = style_degree - - async def set_volume(self, volume: str): - logger.debug(f"Setting TTS volume to: [{volume}]") - self._params.volume = volume - - async def set_params(self, **kwargs): - valid_params = { - "voice": self.set_voice, - "emphasis": self.set_emphasis, - "language_code": self.set_language, - "pitch": self.set_pitch, - "rate": self.set_rate, - "role": self.set_role, - "style": self.set_style, - "style_degree": self.set_style_degree, - "volume": self.set_volume, - } - - for param, value in kwargs.items(): - if param in valid_params: - await valid_params[param](value) - else: - logger.warning(f"Ignoring unknown parameter: {param}") - - logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}") - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") @@ -222,7 +172,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.push_frame(TTSStartedFrame()) # Azure always sends a 44-byte header. Strip it off. yield TTSAudioRawFrame( - audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1 + audio=result.audio_data[44:], + sample_rate=self._settings["sample_rate"], + num_channels=1, ) await self.push_frame(TTSStoppedFrame()) elif result.reason == ResultReason.Canceled: diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 5f798b1e5..0817879c4 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -4,36 +4,35 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import asyncio +import base64 import json import uuid -import base64 -import asyncio +from typing import AsyncGenerator, List, Optional, Union -from typing import AsyncGenerator, Optional, Union, List +from loguru import logger from pydantic.main import BaseModel from pipecat.frames.frames import ( CancelFrame, + EndFrame, ErrorFrame, Frame, - StartInterruptionFrame, + LLMFullResponseEndFrame, StartFrame, - EndFrame, + StartInterruptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, - LLMFullResponseEndFrame, ) from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import TTSService, WordTTSService from pipecat.transcriptions.language import Language -from pipecat.services.ai_services import WordTTSService, TTSService - -from loguru import logger # See .env.example for Cartesia configuration needed try: - from cartesia import AsyncCartesia import websockets + from cartesia import AsyncCartesia except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( @@ -66,7 +65,7 @@ class InputParams(BaseModel): encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" - language: Optional[str] = "en" + language: Optional[Language] = Language.EN speed: Optional[Union[str, float]] = "" emotion: Optional[List[str]] = [] @@ -77,7 +76,7 @@ def __init__( voice_id: str, cartesia_version: str = "2024-06-10", url: str = "wss://api.cartesia.ai/tts/websocket", - model_id: str = "sonic-english", + model: str = "sonic-english", params: InputParams = InputParams(), **kwargs, ): @@ -101,17 +100,18 @@ def __init__( self._api_key = api_key self._cartesia_version = cartesia_version self._url = url - self._voice_id = voice_id - self._model_id = model_id - self.set_model_name(model_id) 
- self._output_format = { - "container": params.container, - "encoding": params.encoding, - "sample_rate": params.sample_rate, + self._settings = { + "output_format": { + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, + }, + "language": language_to_cartesia_language(params.language) if params.language else None, + "speed": params.speed, + "emotion": params.emotion, } - self._language = params.language - self._speed = params.speed - self._emotion = params.emotion + self.set_model_name(model) + self.set_voice(voice_id) self._websocket = None self._context_id = None @@ -125,42 +125,28 @@ async def set_model(self, model: str): await super().set_model(model) logger.debug(f"Switching TTS model to: [{model}]") - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice_id = voice - - async def set_speed(self, speed: str): - logger.debug(f"Switching TTS speed to: [{speed}]") - self._speed = speed - - async def set_emotion(self, emotion: list[str]): - logger.debug(f"Switching TTS emotion to: [{emotion}]") - self._emotion = emotion - - async def set_language(self, language: Language): - logger.debug(f"Switching TTS language to: [{language}]") - self._language = language_to_cartesia_language(language) - def _build_msg( self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True ): - voice_config = {"mode": "id", "id": self._voice_id} + voice_config = {} + voice_config["mode"] = "id" + voice_config["id"] = self._voice_id - if self._speed or self._emotion: + if self._settings["speed"] or self._settings["emotion"]: voice_config["__experimental_controls"] = {} - if self._speed: - voice_config["__experimental_controls"]["speed"] = self._speed - if self._emotion: - voice_config["__experimental_controls"]["emotion"] = self._emotion + if self._settings["speed"]: + voice_config["__experimental_controls"]["speed"] = self._settings["speed"] + if self._settings["emotion"]: + voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"] msg = { "transcript": text, "continue": continue_transcript, "context_id": self._context_id, - "model_id": self._model_name, + "model_id": self.model_name, "voice": voice_config, - "output_format": self._output_format, - "language": self._language, + "output_format": self._settings["output_format"], + "language": self._settings["language"], "add_timestamps": add_timestamps, } return json.dumps(msg) @@ -245,7 +231,7 @@ async def _receive_task_handler(self): self.start_word_timestamps() frame = TTSAudioRawFrame( audio=base64.b64decode(msg["data"]), - sample_rate=self._output_format["sample_rate"], + sample_rate=self._settings["output_format"]["sample_rate"], num_channels=1, ) await self.push_frame(frame) @@ -303,7 +289,7 @@ def __init__( *, api_key: str, voice_id: str, - model_id: str = "sonic-english", + model: str = "sonic-english", base_url: str = "https://api.cartesia.ai", params: InputParams = InputParams(), **kwargs, @@ -311,17 +297,18 @@ def __init__( super().__init__(**kwargs) self._api_key = api_key - self._voice_id = voice_id - self._model_id = model_id - self.set_model_name(model_id) - self._output_format = { - "container": params.container, - "encoding": params.encoding, - "sample_rate": params.sample_rate, + self._settings = { + "output_format": { + "container": params.container, + "encoding": params.encoding, + "sample_rate": params.sample_rate, + }, + "language": params.language, + "speed": params.speed, + "emotion": 
params.emotion, } - self._language = params.language - self._speed = params.speed - self._emotion = params.emotion + self.set_voice(voice_id) + self.set_model_name(model) self._client = AsyncCartesia(api_key=api_key, base_url=base_url) @@ -333,22 +320,6 @@ async def set_model(self, model: str): self._model_id = model await super().set_model(model) - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice_id = voice - - async def set_speed(self, speed: str): - logger.debug(f"Switching TTS speed to: [{speed}]") - self._speed = speed - - async def set_emotion(self, emotion: list[str]): - logger.debug(f"Switching TTS emotion to: [{emotion}]") - self._emotion = emotion - - async def set_language(self, language: Language): - logger.debug(f"Switching TTS language to: [{language}]") - self._language = language_to_cartesia_language(language) - async def stop(self, frame: EndFrame): await super().stop(frame) await self._client.close() @@ -365,19 +336,19 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: try: voice_controls = None - if self._speed or self._emotion: + if self._settings["speed"] or self._settings["emotion"]: voice_controls = {} - if self._speed: - voice_controls["speed"] = self._speed - if self._emotion: - voice_controls["emotion"] = self._emotion + if self._settings["speed"]: + voice_controls["speed"] = self._settings["speed"] + if self._settings["emotion"]: + voice_controls["emotion"] = self._settings["emotion"] output = await self._client.tts.sse( model_id=self._model_id, transcript=text, voice_id=self._voice_id, - output_format=self._output_format, - language=self._language, + output_format=self._settings["output_format"], + language=self._settings["language"], stream=False, _experimental_voice_controls=voice_controls, ) @@ -386,7 +357,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: frame = TTSAudioRawFrame( audio=output["audio"], - sample_rate=self._output_format["sample_rate"], + sample_rate=self._settings["output_format"]["sample_rate"], num_channels=1, ) yield frame diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index d109cce3c..40fe0168d 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -5,9 +5,10 @@ # import asyncio - from typing import AsyncGenerator +from loguru import logger + from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -24,8 +25,6 @@ from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 -from loguru import logger - # See .env.example for Deepgram configuration needed try: from deepgram import ( @@ -50,32 +49,30 @@ def __init__( self, *, api_key: str, - voice: str = "aura-helios-en", + voice_id: str = "aura-helios-en", sample_rate: int = 16000, encoding: str = "linear16", **kwargs, ): super().__init__(**kwargs) - self._voice = voice - self._sample_rate = sample_rate - self._encoding = encoding + self._settings = { + "sample_rate": sample_rate, + "encoding": encoding, + } + self.set_voice(voice_id) self._deepgram_client = DeepgramClient(api_key=api_key) def can_generate_metrics(self) -> bool: return True - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice = voice - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") options = SpeakOptions( - model=self._voice, - encoding=self._encoding, - sample_rate=self._sample_rate, + 
model=self._voice_id, + encoding=self._settings["encoding"], + sample_rate=self._settings["sample_rate"], container="none", ) @@ -103,7 +100,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: chunk = audio_buffer.read(chunk_size) if not chunk: break - frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1) + frame = TTSAudioRawFrame( + audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1 + ) yield frame await self.push_frame(TTSStoppedFrame()) @@ -135,7 +134,7 @@ def __init__( ): super().__init__(**kwargs) - self._live_options = live_options + self._settings = vars(live_options) self._client = DeepgramClient( api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"}) @@ -147,7 +146,7 @@ def __init__( @property def vad_enabled(self): - return self._live_options.vad_events + return self._settings["vad_events"] def can_generate_metrics(self) -> bool: return self.vad_enabled @@ -155,13 +154,7 @@ def can_generate_metrics(self) -> bool: async def set_model(self, model: str): await super().set_model(model) logger.debug(f"Switching STT model to: [{model}]") - self._live_options.model = model - await self._disconnect() - await self._connect() - - async def set_language(self, language: Language): - logger.debug(f"Switching STT language to: [{language}]") - self._live_options.language = language + self._settings["model"] = model await self._disconnect() await self._connect() @@ -182,7 +175,7 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: yield None async def _connect(self): - if await self._connection.start(self._live_options): + if await self._connection.start(self._settings): logger.debug(f"{self}: Connected to Deepgram") else: logger.error(f"{self}: Unable to connect to Deepgram") diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 611f2a024..871b3eec6 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -24,6 +24,7 @@ ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import WordTTSService +from pipecat.transcriptions.language import Language # See .env.example for ElevenLabs configuration needed try: @@ -72,7 +73,7 @@ def calculate_word_times( class ElevenLabsTTSService(WordTTSService): class InputParams(BaseModel): - language: Optional[str] = None + language: Optional[Language] = Language.EN output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000" optimize_streaming_latency: Optional[str] = None stability: Optional[float] = None @@ -124,10 +125,19 @@ def __init__( ) self._api_key = api_key - self._voice_id = voice_id - self.set_model_name(model) self._url = url - self._params = params + self._settings = { + "sample_rate": sample_rate_from_output_format(params.output_format), + "language": params.language, + "output_format": params.output_format, + "optimize_streaming_latency": params.optimize_streaming_latency, + "stability": params.stability, + "similarity_boost": params.similarity_boost, + "style": params.style, + "use_speaker_boost": params.use_speaker_boost, + } + self.set_model_name(model) + self.set_voice(voice_id) self._voice_settings = self._set_voice_settings() # Websocket connection to ElevenLabs. 
@@ -142,19 +152,22 @@ def can_generate_metrics(self) -> bool:
 
     def _set_voice_settings(self):
         voice_settings = {}
-        if self._params.stability is not None and self._params.similarity_boost is not None:
-            voice_settings["stability"] = self._params.stability
-            voice_settings["similarity_boost"] = self._params.similarity_boost
-            if self._params.style is not None:
-                voice_settings["style"] = self._params.style
-            if self._params.use_speaker_boost is not None:
-                voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
+        if (
+            self._settings["stability"] is not None
+            and self._settings["similarity_boost"] is not None
+        ):
+            voice_settings["stability"] = self._settings["stability"]
+            voice_settings["similarity_boost"] = self._settings["similarity_boost"]
+            if self._settings["style"] is not None:
+                voice_settings["style"] = self._settings["style"]
+            if self._settings["use_speaker_boost"] is not None:
+                voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
         else:
-            if self._params.style is not None:
+            if self._settings["style"] is not None:
                 logger.warning(
                     "'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
                 )
-            if self._params.use_speaker_boost is not None:
+            if self._settings["use_speaker_boost"] is not None:
                 logger.warning(
                     "'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
                 )
@@ -167,33 +180,13 @@ async def set_model(self, model: str):
         await self._disconnect()
         await self._connect()
 
-    async def set_voice(self, voice: str):
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-        await self._disconnect()
-        await self._connect()
-
-    async def set_voice_settings(
-        self,
-        stability: Optional[float] = None,
-        similarity_boost: Optional[float] = None,
-        style: Optional[float] = None,
-        use_speaker_boost: Optional[bool] = None,
-    ):
-        self._params.stability = stability if stability is not None else self._params.stability
-        self._params.similarity_boost = (
-            similarity_boost if similarity_boost is not None else self._params.similarity_boost
-        )
-        self._params.style = style if style is not None else self._params.style
-        self._params.use_speaker_boost = (
-            use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
-        )
-
-        self._set_voice_settings()
-
-        if self._websocket:
-            msg = {"voice_settings": self._voice_settings}
-            await self._websocket.send(json.dumps(msg))
+    async def _update_settings(self, settings: Dict[str, Any]):
+        prev_voice = self._voice_id
+        await super()._update_settings(settings)
+        if not prev_voice == self._voice_id:
+            await self._disconnect()
+            await self._connect()
+            logger.debug(f"Switching TTS voice to: [{self._voice_id}]")
 
     async def start(self, frame: StartFrame):
         await super().start(frame)
@@ -223,19 +216,19 @@ async def _connect(self):
         try:
             voice_id = self._voice_id
             model = self.model_name
-            output_format = self._params.output_format
+            output_format = self._settings["output_format"]
             url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
 
-            if self._params.optimize_streaming_latency:
-                url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
+            if self._settings["optimize_streaming_latency"]:
+                url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
 
             # language can only be used with the 'eleven_turbo_v2_5' model
-            if self._params.language:
+            if self._settings["language"]:
                 if model == "eleven_turbo_v2_5":
-                    url += f"&language_code={self._params.language}"
+                    url += f"&language_code={self._settings['language']}"
                 else:
                     logger.debug(
-                        f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
+                        f"Language code [{self._settings['language']}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
                     )
 
             self._websocket = await websockets.connect(url)
@@ -286,7 +279,7 @@ async def _receive_task_handler(self):
                         self.start_word_timestamps()
 
                     audio = base64.b64decode(msg["audio"])
-                    frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
+                    frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
                     await self.push_frame(frame)
 
                     if msg.get("alignment"):
diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py
index a590d73cf..16f3dab97 100644
--- a/src/pipecat/services/gladia.py
+++ b/src/pipecat/services/gladia.py
@@ -6,8 +6,9 @@
 import base64
 import json
-
 from typing import AsyncGenerator, Optional
+
+from loguru import logger
 from pydantic.main import BaseModel
 
 from pipecat.frames.frames import (
@@ -21,8 +22,6 @@
 from pipecat.services.ai_services import STTService
 from pipecat.utils.time import time_now_iso8601
 
-from loguru import logger
-
 # See .env.example for Gladia configuration needed
 try:
     import websockets
@@ -55,7 +54,13 @@ def __init__(
 
         self._api_key = api_key
         self._url = url
-        self._params = params
+        self._settings = {
+            "sample_rate": params.sample_rate,
+            "language": params.language,
+            "transcription_hint": params.transcription_hint,
+            "endpointing": params.endpointing,
+            "prosody": params.prosody,
+        }
         self._confidence = confidence
 
     async def start(self, frame: StartFrame):
@@ -84,7 +89,11 @@ async def _setup_gladia(self):
             "encoding": "WAV/PCM",
             "model_type": "fast",
             "language_behaviour": "manual",
-            **self._params.model_dump(exclude_none=True),
+            "sample_rate": self._settings["sample_rate"],
+            "language": self._settings["language"],
+            "transcription_hint": self._settings["transcription_hint"],
+            "endpointing": self._settings["endpointing"],
+            "prosody": self._settings["prosody"],
         }
 
         await self._websocket.send(json.dumps(configuration))
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index 092b4703e..05fff2056 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -6,7 +6,7 @@
 import asyncio
 import json
-from typing import Any, AsyncGenerator, Dict, List, Literal, Optional
+from typing import AsyncGenerator, List, Literal, Optional
 
 from loguru import logger
 from pydantic import BaseModel
@@ -17,7 +17,7 @@
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesFrame,
-    ServiceUpdateSettingsFrame,
+    LLMUpdateSettingsFrame,
     TextFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
@@ -64,21 +64,6 @@ def _create_client(self, model: str):
         self.set_model_name(model)
         self._client = gai.GenerativeModel(model)
 
-    async def set_model(self, model: str):
-        logger.debug(f"Switching LLM model to: [{model}]")
-        self._create_client(model)
-
-    async def _update_settings(self, settings: Dict[str, Any]):
-        for key, value in settings.items():
-            setter = getattr(self, f"set_{key}", None)
-            if setter and callable(setter):
-                try:
-                    await setter(value)
-                except Exception as e:
-                    logger.warning(f"Error setting {key}: {e}")
-            else:
-                logger.warning(f"Unknown setting for Google LLM service: {key}")
-
     def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]:
         openai_messages = context.get_messages()
         google_messages = []
@@ -151,7 +136,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             context = OpenAILLMContext.from_messages(frame.messages)
         elif isinstance(frame, VisionImageRawFrame):
             context = OpenAILLMContext.from_image_frame(frame)
-        elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+        elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
             await self.push_frame(frame, direction)
@@ -182,8 +167,17 @@ def __init__(
     ):
         super().__init__(sample_rate=sample_rate, **kwargs)
 
-        self._voice_id: str = voice_id
-        self._params = params
+        self._settings = {
+            "sample_rate": sample_rate,
+            "pitch": params.pitch,
+            "rate": params.rate,
+            "volume": params.volume,
+            "emphasis": params.emphasis,
+            "language": params.language,
+            "gender": params.gender,
+            "google_style": params.google_style,
+        }
+        self.set_voice(voice_id)
         self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
             credentials, credentials_path
         )
@@ -216,38 +210,38 @@ def _construct_ssml(self, text: str) -> str:
 
         # Voice tag
         voice_attrs = [f"name='{self._voice_id}'"]
-        if self._params.language:
-            voice_attrs.append(f"language='{self._params.language}'")
-        if self._params.gender:
-            voice_attrs.append(f"gender='{self._params.gender}'")
+        if self._settings["language"]:
+            voice_attrs.append(f"language='{self._settings['language']}'")
+        if self._settings["gender"]:
+            voice_attrs.append(f"gender='{self._settings['gender']}'")
         ssml += f"<voice {' '.join(voice_attrs)}>"
 
         # Prosody tag
         prosody_attrs = []
-        if self._params.pitch:
-            prosody_attrs.append(f"pitch='{self._params.pitch}'")
-        if self._params.rate:
-            prosody_attrs.append(f"rate='{self._params.rate}'")
-        if self._params.volume:
-            prosody_attrs.append(f"volume='{self._params.volume}'")
+        if self._settings["pitch"]:
+            prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+        if self._settings["rate"]:
+            prosody_attrs.append(f"rate='{self._settings['rate']}'")
+        if self._settings["volume"]:
+            prosody_attrs.append(f"volume='{self._settings['volume']}'")
 
         if prosody_attrs:
             ssml += f"<prosody {' '.join(prosody_attrs)}>"
 
         # Emphasis tag
-        if self._params.emphasis:
-            ssml += f"<emphasis level='{self._params.emphasis}'>"
+        if self._settings["emphasis"]:
+            ssml += f"<emphasis level='{self._settings['emphasis']}'>"
 
         # Google style tag
-        if self._params.google_style:
-            ssml += f"<google:style name='{self._params.google_style}'>"
+        if self._settings["google_style"]:
+            ssml += f"<google:style name='{self._settings['google_style']}'>"
 
         ssml += text
 
         # Close tags
-        if self._params.google_style:
+        if self._settings["google_style"]:
             ssml += "</google:style>"
-        if self._params.emphasis:
+        if self._settings["emphasis"]:
             ssml += "</emphasis>"
         if prosody_attrs:
             ssml += "</prosody>"
@@ -255,46 +249,6 @@ def _construct_ssml(self, text: str) -> str:
 
         return ssml
 
-    async def set_voice(self, voice: str) -> None:
-        logger.debug(f"Switching TTS voice to: [{voice}]")
-        self._voice_id = voice
-
-    async def set_language(self, language: str) -> None:
-        logger.debug(f"Switching TTS language to: [{language}]")
-        self._params.language = language
-
-    async def set_pitch(self, pitch: str) -> None:
-        logger.debug(f"Switching TTS pitch to: [{pitch}]")
-        self._params.pitch = pitch
-
-    async def set_rate(self, rate: str) -> None:
-        logger.debug(f"Switching TTS rate to: [{rate}]")
-        self._params.rate = rate
-
-    async def set_volume(self, volume: str) -> None:
-        logger.debug(f"Switching TTS volume to: [{volume}]")
-        self._params.volume = volume
-
-    async def set_emphasis(
-        self, emphasis: Literal["strong", "moderate", "reduced", "none"]
-    ) -> None:
-        logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
-        self._params.emphasis = emphasis
-
-    async def set_gender(self, gender: Literal["male", "female", 
"neutral"]) -> None: - logger.debug(f"Switch TTS gender to [{gender}]") - self._params.gender = gender - - async def set_google_style( - self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"] - ) -> None: - logger.debug(f"Switching TTS google style to: [{google_style}]") - self._params.google_style = google_style - - async def set_params(self, params: InputParams) -> None: - logger.debug(f"Switching TTS params to: [{params}]") - self._params = params - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") @@ -304,11 +258,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: ssml = self._construct_ssml(text) synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml) voice = texttospeech_v1.VoiceSelectionParams( - language_code=self._params.language, name=self._voice_id + language_code=self._settings["language"], name=self._voice_id ) audio_config = texttospeech_v1.AudioConfig( audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16, - sample_rate_hertz=self.sample_rate, + sample_rate_hertz=self._settings["sample_rate"], ) request = texttospeech_v1.SynthesizeSpeechRequest( @@ -331,7 +285,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: if not chunk: break await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) + frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1) yield frame await asyncio.sleep(0) # Allow other tasks to run diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py index 8f18002c5..c828e7a7a 100644 --- a/src/pipecat/services/lmnt.py +++ b/src/pipecat/services/lmnt.py @@ -5,10 +5,10 @@ # import asyncio - from typing import AsyncGenerator -from pipecat.processors.frame_processor import FrameDirection +from loguru import logger + from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -20,10 +20,9 @@ TTSStartedFrame, TTSStoppedFrame, ) +from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import TTSService -from loguru import logger - # See .env.example for LMNT configuration needed try: from lmnt.api import Speech @@ -50,13 +49,16 @@ def __init__( super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs) self._api_key = api_key - self._voice_id = voice_id - self._output_format = { - "container": "raw", - "encoding": "pcm_s16le", - "sample_rate": sample_rate, + self._settings = { + "output_format": { + "container": "raw", + "encoding": "pcm_s16le", + "sample_rate": sample_rate, + }, + "language": language, } - self._language = language + + self.set_voice(voice_id) self._speech = None self._connection = None @@ -68,10 +70,6 @@ def __init__( def can_generate_metrics(self) -> bool: return True - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice_id = voice - async def start(self, frame: StartFrame): await super().start(frame) await self._connect() @@ -93,7 +91,9 @@ async def _connect(self): try: self._speech = Speech() self._connection = await self._speech.synthesize_streaming( - self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"] + self._voice_id, + format="raw", + sample_rate=self._settings["output_format"]["sample_rate"], ) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) except Exception as e: @@ -130,7 +130,7 @@ async def _receive_task_handler(self): await self.stop_ttfb_metrics() frame = TTSAudioRawFrame( 
audio=msg["audio"], - sample_rate=self._output_format["sample_rate"], + sample_rate=self._settings["output_format"]["sample_rate"], num_channels=1, ) await self.push_frame(frame) diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 57c1b3399..68b7fa4de 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -24,7 +24,7 @@ LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesFrame, - ServiceUpdateSettingsFrame, + LLMUpdateSettingsFrame, StartInterruptionFrame, TextFrame, TTSAudioRawFrame, @@ -111,14 +111,16 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + self._settings = { + "frequency_penalty": params.frequency_penalty, + "presence_penalty": params.presence_penalty, + "seed": params.seed, + "temperature": params.temperature, + "top_p": params.top_p, + "extra": params.extra if isinstance(params.extra, dict) else {}, + } self.set_model_name(model) self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs) - self._frequency_penalty = params.frequency_penalty - self._presence_penalty = params.presence_penalty - self._seed = params.seed - self._temperature = params.temperature - self._top_p = params.top_p - self._extra = params.extra if isinstance(params.extra, dict) else {} def create_client(self, api_key=None, base_url=None, **kwargs): return AsyncOpenAI( @@ -134,30 +136,6 @@ def create_client(self, api_key=None, base_url=None, **kwargs): def can_generate_metrics(self) -> bool: return True - async def set_frequency_penalty(self, frequency_penalty: float): - logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]") - self._frequency_penalty = frequency_penalty - - async def set_presence_penalty(self, presence_penalty: float): - logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]") - self._presence_penalty = presence_penalty - - async def set_seed(self, seed: int): - logger.debug(f"Switching LLM seed to: [{seed}]") - self._seed = seed - - async def set_temperature(self, temperature: float): - logger.debug(f"Switching LLM temperature to: [{temperature}]") - self._temperature = temperature - - async def set_top_p(self, top_p: float): - logger.debug(f"Switching LLM top_p to: [{top_p}]") - self._top_p = top_p - - async def set_extra(self, extra: Dict[str, Any]): - logger.debug(f"Switching LLM extra to: [{extra}]") - self._extra = extra - async def get_chat_completions( self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam] ) -> AsyncStream[ChatCompletionChunk]: @@ -168,14 +146,14 @@ async def get_chat_completions( "tools": context.tools, "tool_choice": context.tool_choice, "stream_options": {"include_usage": True}, - "frequency_penalty": self._frequency_penalty, - "presence_penalty": self._presence_penalty, - "seed": self._seed, - "temperature": self._temperature, - "top_p": self._top_p, + "frequency_penalty": self._settings["frequency_penalty"], + "presence_penalty": self._settings["presence_penalty"], + "seed": self._settings["seed"], + "temperature": self._settings["temperature"], + "top_p": self._settings["top_p"], } - params.update(self._extra) + params.update(self._settings["extra"]) chunks = await self._client.chat.completions.create(**params) return chunks @@ -295,17 +273,6 @@ async def _process_context(self, context: OpenAILLMContext): f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." 
) - async def _update_settings(self, settings: Dict[str, Any]): - for key, value in settings.items(): - setter = getattr(self, f"set_{key}", None) - if setter and callable(setter): - try: - await setter(value) - except Exception as e: - logger.warning(f"Error setting {key}: {e}") - else: - logger.warning(f"Unknown setting for OpenAI LLM service: {key}") - async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -316,7 +283,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): context = OpenAILLMContext.from_image_frame(frame) - elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm": + elif isinstance(frame, LLMUpdateSettingsFrame): await self._update_settings(frame.settings) else: await self.push_frame(frame, direction) @@ -414,29 +381,27 @@ def __init__( self, *, api_key: str | None = None, - voice: str = "alloy", + voice_id: str = "alloy", model: Literal["tts-1", "tts-1-hd"] = "tts-1", sample_rate: int = 24000, **kwargs, ): super().__init__(sample_rate=sample_rate, **kwargs) - self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy") + self._settings = { + "sample_rate": sample_rate, + } self.set_model_name(model) - self._sample_rate = sample_rate + self.set_voice(voice_id) self._client = AsyncOpenAI(api_key=api_key) def can_generate_metrics(self) -> bool: return True - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice = VALID_VOICES.get(voice, self._voice) - async def set_model(self, model: str): logger.debug(f"Switching TTS model to: [{model}]") - self._model = model + self.set_model_name(model) async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") @@ -446,7 +411,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: async with self._client.audio.speech.with_streaming_response.create( input=text, model=self.model_name, - voice=self._voice, + voice=VALID_VOICES[self._voice_id], response_format="pcm", ) as r: if r.status_code != 200: @@ -465,7 +430,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: async for chunk in r.iter_bytes(8192): if len(chunk) > 0: await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) + frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1) yield frame await self.push_frame(TTSStoppedFrame()) except BadRequestError as e: diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py index 2ffa3a419..aea5d8d92 100644 --- a/src/pipecat/services/playht.py +++ b/src/pipecat/services/playht.py @@ -6,17 +6,21 @@ import io import struct - from typing import AsyncGenerator -from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame -from pipecat.services.ai_services import TTSService - from loguru import logger +from pipecat.frames.frames import ( + Frame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, +) +from pipecat.services.ai_services import TTSService + try: - from pyht.client import TTSOptions from pyht.async_client import AsyncClient + from pyht.client import TTSOptions from pyht.protos.api_pb2 import Format except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -28,7 +32,7 @@ class PlayHTTTSService(TTSService): def __init__( - self, *, api_key: str, user_id: str, voice_url: str, sample_rate: 
int = 16000, **kwargs + self, *, api_key: str, user_id: str, voice_id: str, sample_rate: int = 16000, **kwargs ): super().__init__(sample_rate=sample_rate, **kwargs) @@ -39,17 +43,23 @@ def __init__( user_id=self._user_id, api_key=self._speech_key, ) + self._settings = { + "sample_rate": sample_rate, + "quality": "higher", + "format": Format.FORMAT_WAV, + "voice_engine": "PlayHT2.0-turbo", + } + self.set_voice(voice_id) self._options = TTSOptions( - voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV + voice=self._voice_id, + sample_rate=self._settings["sample_rate"], + quality=self._settings["quality"], + format=self._settings["format"], ) def can_generate_metrics(self) -> bool: return True - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._options.voice = voice - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") @@ -60,7 +70,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.start_ttfb_metrics() playht_gen = self._client.tts( - text, voice_engine="PlayHT2.0-turbo", options=self._options + text, voice_engine=self._settings["voice_engine"], options=self._options ) await self.start_tts_usage_metrics(text) @@ -83,7 +93,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: else: if len(chunk): await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(chunk, 16000, 1) + frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1) yield frame await self.push_frame(TTSStoppedFrame()) except Exception as e: diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py index 26a1a99fd..1bf74e508 100644 --- a/src/pipecat/services/together.py +++ b/src/pipecat/services/together.py @@ -50,13 +50,15 @@ def __init__( ): super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs) self.set_model_name(model) - self._max_tokens = params.max_tokens - self._frequency_penalty = params.frequency_penalty - self._presence_penalty = params.presence_penalty - self._temperature = params.temperature - self._top_k = params.top_k - self._top_p = params.top_p - self._extra = params.extra if isinstance(params.extra, dict) else {} + self._settings = { + "max_tokens": params.max_tokens, + "frequency_penalty": params.frequency_penalty, + "presence_penalty": params.presence_penalty, + "seed": params.seed, + "temperature": params.temperature, + "top_p": params.top_p, + "extra": params.extra if isinstance(params.extra, dict) else {}, + } def can_generate_metrics(self) -> bool: return True @@ -72,42 +74,3 @@ def create_client(self, api_key=None, base_url=None, **kwargs): ) ), ) - - async def set_frequency_penalty(self, frequency_penalty: float): - logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]") - self._frequency_penalty = frequency_penalty - - async def set_max_tokens(self, max_tokens: int): - logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]") - self._max_tokens = max_tokens - - async def set_presence_penalty(self, presence_penalty: float): - logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]") - self._presence_penalty = presence_penalty - - async def set_temperature(self, temperature: float): - logger.debug(f"Switching LLM temperature to: [{temperature}]") - self._temperature = temperature - - async def set_top_k(self, top_k: float): - logger.debug(f"Switching LLM top_k to: [{top_k}]") - self._top_k = top_k - - async def 
set_top_p(self, top_p: float): - logger.debug(f"Switching LLM top_p to: [{top_p}]") - self._top_p = top_p - - async def set_extra(self, extra: Dict[str, Any]): - logger.debug(f"Switching LLM extra to: [{extra}]") - self._extra = extra - - async def _update_settings(self, settings: Dict[str, Any]): - for key, value in settings.items(): - setter = getattr(self, f"set_{key}", None) - if setter and callable(setter): - try: - await setter(value) - except Exception as e: - logger.warning(f"Error setting {key}: {e}") - else: - logger.warning(f"Unknown setting for Together LLM service: {key}") diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 2c47d59e8..7826cfcd8 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -4,10 +4,12 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp - from typing import Any, AsyncGenerator, Dict +import aiohttp +import numpy as np +from loguru import logger + from pipecat.frames.frames import ( ErrorFrame, Frame, @@ -18,10 +20,6 @@ ) from pipecat.services.ai_services import TTSService -import numpy as np - -from loguru import logger - try: import resampy except ModuleNotFoundError as e: @@ -50,9 +48,11 @@ def __init__( ): super().__init__(**kwargs) - self._voice_id = voice_id - self._language = language - self._base_url = base_url + self._settings = { + "language": language, + "base_url": base_url, + } + self.set_voice(voice_id) self._studio_speakers: Dict[str, Any] | None = None self._aiohttp_session = aiohttp_session @@ -61,7 +61,7 @@ def can_generate_metrics(self) -> bool: async def start(self, frame: StartFrame): await super().start(frame) - async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r: + async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r: if r.status != 200: text = await r.text() logger.error( @@ -75,10 +75,6 @@ async def start(self, frame: StartFrame): return self._studio_speakers = await r.json() - async def set_voice(self, voice: str): - logger.debug(f"Switching TTS voice to: [{voice}]") - self._voice_id = voice - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]") @@ -88,11 +84,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: embeddings = self._studio_speakers[self._voice_id] - url = self._base_url + "/tts_stream" + url = self._settings["base_url"] + "/tts_stream" payload = { "text": text.replace(".", "").replace("*", ""), - "language": self._language, + "language": self._settings["language"], "speaker_embedding": embeddings["speaker_embedding"], "gpt_cond_latent": embeddings["gpt_cond_latent"], "add_wav_header": False, From d75a02dc51d780ea6b95d4d548db59836a3dc140 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 1 Oct 2024 21:01:44 -0400 Subject: [PATCH 4/5] Use Language enum and set languages accordingly --- src/pipecat/services/aws.py | 66 +++++++++++++++++++- src/pipecat/services/azure.py | 96 +++++++++++++++++++++++++++++- src/pipecat/services/cartesia.py | 6 +- src/pipecat/services/deepgram.py | 2 +- src/pipecat/services/elevenlabs.py | 74 ++++++++++++++++++++++- src/pipecat/services/gladia.py | 83 +++++++++++++++++++++++++- src/pipecat/services/google.py | 94 ++++++++++++++++++++++++++++- src/pipecat/services/lmnt.py | 31 +++++++++- src/pipecat/services/xtts.py | 49 ++++++++++++++- 9 files changed, 484 insertions(+), 17 deletions(-) diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py index 
7004c21f7..210e0e700 100644 --- a/src/pipecat/services/aws.py +++ b/src/pipecat/services/aws.py @@ -17,6 +17,7 @@ TTSStoppedFrame, ) from pipecat.services.ai_services import TTSService +from pipecat.transcriptions.language import Language try: import boto3 @@ -29,10 +30,71 @@ raise Exception(f"Missing module: {e}") +def language_to_aws_language(language: Language) -> str | None: + match language: + case Language.CA: + return "ca-ES" + case Language.ZH: + return "cmn-CN" + case Language.DA: + return "da-DK" + case Language.NL: + return "nl-NL" + case Language.NL_BE: + return "nl-BE" + case Language.EN: + return "en-US" + case Language.EN_US: + return "en-US" + case Language.EN_AU: + return "en-AU" + case Language.EN_GB: + return "en-GB" + case Language.EN_NZ: + return "en-NZ" + case Language.EN_IN: + return "en-IN" + case Language.FI: + return "fi-FI" + case Language.FR: + return "fr-FR" + case Language.FR_CA: + return "fr-CA" + case Language.DE: + return "de-DE" + case Language.HI: + return "hi-IN" + case Language.IT: + return "it-IT" + case Language.JA: + return "ja-JP" + case Language.KO: + return "ko-KR" + case Language.NO: + return "nb-NO" + case Language.PL: + return "pl-PL" + case Language.PT: + return "pt-PT" + case Language.PT_BR: + return "pt-BR" + case Language.RO: + return "ro-RO" + case Language.RU: + return "ru-RU" + case Language.ES: + return "es-ES" + case Language.SV: + return "sv-SE" + case Language.TR: + return "tr-TR" + return None + + class AWSTTSService(TTSService): class InputParams(BaseModel): engine: Optional[str] = None - language: Optional[str] = None + language: Optional[Language] = Language.EN pitch: Optional[str] = None rate: Optional[str] = None volume: Optional[str] = None @@ -59,7 +121,7 @@ def __init__( self._settings = { "sample_rate": sample_rate, "engine": params.engine, - "language": params.language, + "language": language_to_aws_language(params.language) if params.language else "en-US", "pitch": params.pitch, "rate": params.rate, "volume": params.volume, diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 1b2b9a3f2..078b7df6b 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -27,6 +27,7 @@ ) from pipecat.services.ai_services import ImageGenService, STTService, TTSService from pipecat.services.openai import BaseOpenAILLMService +from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 # See .env.example for Azure configuration needed @@ -70,10 +71,101 @@ def create_client(self, api_key=None, base_url=None, **kwargs): ) +def language_to_azure_language(language: Language) -> str | None: + match language: + case Language.BG: + return "bg-BG" + case Language.CA: + return "ca-ES" + case Language.ZH: + return "zh-CN" + case Language.ZH_TW: + return "zh-TW" + case Language.CS: + return "cs-CZ" + case Language.DA: + return "da-DK" + case Language.NL: + return "nl-NL" + case Language.EN: + return "en-US" + case Language.EN_US: + return "en-US" + case Language.EN_AU: + return "en-AU" + case Language.EN_GB: + return "en-GB" + case Language.EN_NZ: + return "en-NZ" + case Language.EN_IN: + return "en-IN" + case Language.ET: + return "et-EE" + case Language.FI: + return "fi-FI" + case Language.NL_BE: + return "nl-BE" + case Language.FR: + return "fr-FR" + case Language.FR_CA: + return "fr-CA" + case Language.DE: + return "de-DE" + case Language.DE_CH: + return "de-CH" + case Language.EL: + return "el-GR" + case Language.HI: + return "hi-IN" + case Language.HU: + 
return "hu-HU" + case Language.ID: + return "id-ID" + case Language.IT: + return "it-IT" + case Language.JA: + return "ja-JP" + case Language.KO: + return "ko-KR" + case Language.LV: + return "lv-LV" + case Language.LT: + return "lt-LT" + case Language.MS: + return "ms-MY" + case Language.NO: + return "nb-NO" + case Language.PL: + return "pl-PL" + case Language.PT: + return "pt-PT" + case Language.PT_BR: + return "pt-BR" + case Language.RO: + return "ro-RO" + case Language.RU: + return "ru-RU" + case Language.SK: + return "sk-SK" + case Language.ES: + return "es-ES" + case Language.SV: + return "sv-SE" + case Language.TH: + return "th-TH" + case Language.TR: + return "tr-TR" + case Language.UK: + return "uk-UA" + case Language.VI: + return "vi-VN" + return None + + class AzureTTSService(TTSService): class InputParams(BaseModel): emphasis: Optional[str] = None - language: Optional[str] = "en-US" + language: Optional[Language] = Language.EN pitch: Optional[str] = None rate: Optional[str] = "1.05" role: Optional[str] = None @@ -99,7 +191,7 @@ def __init__( self._settings = { "sample_rate": sample_rate, "emphasis": params.emphasis, - "language": params.language, + "language": language_to_azure_language(params.language) if params.language else "en-US", "pitch": params.pitch, "rate": params.rate, "role": params.role, diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 0817879c4..22e09cb2b 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -106,7 +106,7 @@ def __init__( "encoding": params.encoding, "sample_rate": params.sample_rate, }, - "language": language_to_cartesia_language(params.language) if params.language else None, + "language": language_to_cartesia_language(params.language) if params.language else "en", "speed": params.speed, "emotion": params.emotion, } @@ -280,7 +280,7 @@ class InputParams(BaseModel): encoding: Optional[str] = "pcm_s16le" sample_rate: Optional[int] = 16000 container: Optional[str] = "raw" - language: Optional[str] = "en" + language: Optional[Language] = Language.EN speed: Optional[Union[str, float]] = "" emotion: Optional[List[str]] = [] @@ -303,7 +303,7 @@ def __init__( "encoding": params.encoding, "sample_rate": params.sample_rate, }, - "language": params.language, + "language": language_to_cartesia_language(params.language) if params.language else None, "speed": params.speed, "emotion": params.emotion, } diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 40fe0168d..55a3ba68e 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -120,7 +120,7 @@ def __init__( url: str = "", live_options: LiveOptions = LiveOptions( encoding="linear16", - language="en-US", + language=Language.EN, model="nova-2-conversationalai", sample_rate=16000, channels=1, diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py index 871b3eec6..695988b03 100644 --- a/src/pipecat/services/elevenlabs.py +++ b/src/pipecat/services/elevenlabs.py @@ -50,6 +50,76 @@ def sample_rate_from_output_format(output_format: str) -> int: return 16000 +def language_to_elevenlabs_language(language: Language) -> str | None: + match language: + case Language.BG: + return "bg" + case Language.ZH: + return "zh" + case Language.CS: + return "cs" + case Language.DA: + return "da" + case Language.NL: + return "nl" + case ( + Language.EN + | Language.EN_US + | Language.EN_AU + | Language.EN_GB + | Language.EN_NZ + | Language.EN_IN + ): + return "en" + 
case Language.FI: + return "fi" + case Language.FR | Language.FR_CA: + return "fr" + case Language.DE | Language.DE_CH: + return "de" + case Language.EL: + return "el" + case Language.HI: + return "hi" + case Language.HU: + return "hu" + case Language.ID: + return "id" + case Language.IT: + return "it" + case Language.JA: + return "ja" + case Language.KO: + return "ko" + case Language.MS: + return "ms" + case Language.NO: + return "no" + case Language.PL: + return "pl" + case Language.PT: + return "pt-PT" + case Language.PT_BR: + return "pt-BR" + case Language.RO: + return "ro" + case Language.RU: + return "ru" + case Language.SK: + return "sk" + case Language.ES: + return "es" + case Language.SV: + return "sv" + case Language.TR: + return "tr" + case Language.UK: + return "uk" + case Language.VI: + return "vi" + return None + + def calculate_word_times( alignment_info: Mapping[str, Any], cumulative_time: float ) -> List[Tuple[str, float]]: @@ -128,7 +198,9 @@ def __init__( self._url = url self._settings = { "sample_rate": sample_rate_from_output_format(params.output_format), - "language": params.language, + "language": language_to_elevenlabs_language(params.language) + if params.language + else "en", "output_format": params.output_format, "optimize_streaming_latency": params.optimize_streaming_latency, "stability": params.stability, diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py index 16f3dab97..0938a1cee 100644 --- a/src/pipecat/services/gladia.py +++ b/src/pipecat/services/gladia.py @@ -20,6 +20,7 @@ TranscriptionFrame, ) from pipecat.services.ai_services import STTService +from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 # See .env.example for Gladia configuration needed @@ -33,10 +34,88 @@ raise Exception(f"Missing module: {e}") +def language_to_gladia_language(language: Language) -> str | None: + match language: + case Language.BG: + return "bg" + case Language.CA: + return "ca" + case Language.ZH: + return "zh" + case Language.CS: + return "cs" + case Language.DA: + return "da" + case Language.NL: + return "nl" + case ( + Language.EN + | Language.EN_US + | Language.EN_AU + | Language.EN_GB + | Language.EN_NZ + | Language.EN_IN + ): + return "en" + case Language.ET: + return "et" + case Language.FI: + return "fi" + case Language.FR | Language.FR_CA: + return "fr" + case Language.DE | Language.DE_CH: + return "de" + case Language.EL: + return "el" + case Language.HI: + return "hi" + case Language.HU: + return "hu" + case Language.ID: + return "id" + case Language.IT: + return "it" + case Language.JA: + return "ja" + case Language.KO: + return "ko" + case Language.LV: + return "lv" + case Language.LT: + return "lt" + case Language.MS: + return "ms" + case Language.NO: + return "no" + case Language.PL: + return "pl" + case Language.PT | Language.PT_BR: + return "pt" + case Language.RO: + return "ro" + case Language.RU: + return "ru" + case Language.SK: + return "sk" + case Language.ES: + return "es" + case Language.SV: + return "sv" + case Language.TH: + return "th" + case Language.TR: + return "tr" + case Language.UK: + return "uk" + case Language.VI: + return "vi" + return None + + class GladiaSTTService(STTService): class InputParams(BaseModel): sample_rate: Optional[int] = 16000 - language: Optional[str] = "english" + language: Optional[Language] = Language.EN transcription_hint: Optional[str] = None endpointing: Optional[int] = 200 prosody: Optional[bool] = None @@ -56,7 +135,7 @@ def __init__( 
self._url = url self._settings = { "sample_rate": params.sample_rate, - "language": params.language, + "language": language_to_gladia_language(params.language) if params.language else "en", "transcription_hint": params.transcription_hint, "endpointing": params.endpointing, "prosody": params.prosody, diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py index 05fff2056..55a2576d2 100644 --- a/src/pipecat/services/google.py +++ b/src/pipecat/services/google.py @@ -30,6 +30,7 @@ ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import LLMService, TTSService +from pipecat.transcriptions.language import Language try: import google.ai.generativelanguage as glm @@ -145,13 +146,100 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): await self._process_context(context) +def language_to_google_language(language: Language) -> str | None: + match language: + case Language.BG: + return "bg-BG" + case Language.CA: + return "ca-ES" + case Language.ZH: + return "cmn-CN" + case Language.ZH_TW: + return "cmn-TW" + case Language.CS: + return "cs-CZ" + case Language.DA: + return "da-DK" + case Language.NL: + return "nl-NL" + case Language.EN: + return "en-US" + case Language.EN_US: + return "en-US" + case Language.EN_AU: + return "en-AU" + case Language.EN_GB: + return "en-GB" + case Language.EN_IN: + return "en-IN" + case Language.ET: + return "et-EE" + case Language.FI: + return "fi-FI" + case Language.NL_BE: + return "nl-BE" + case Language.FR: + return "fr-FR" + case Language.FR_CA: + return "fr-CA" + case Language.DE: + return "de-DE" + case Language.EL: + return "el-GR" + case Language.HI: + return "hi-IN" + case Language.HU: + return "hu-HU" + case Language.ID: + return "id-ID" + case Language.IT: + return "it-IT" + case Language.JA: + return "ja-JP" + case Language.KO: + return "ko-KR" + case Language.LV: + return "lv-LV" + case Language.LT: + return "lt-LT" + case Language.MS: + return "ms-MY" + case Language.NO: + return "nb-NO" + case Language.PL: + return "pl-PL" + case Language.PT: + return "pt-PT" + case Language.PT_BR: + return "pt-BR" + case Language.RO: + return "ro-RO" + case Language.RU: + return "ru-RU" + case Language.SK: + return "sk-SK" + case Language.ES: + return "es-ES" + case Language.SV: + return "sv-SE" + case Language.TH: + return "th-TH" + case Language.TR: + return "tr-TR" + case Language.UK: + return "uk-UA" + case Language.VI: + return "vi-VN" + return None + + class GoogleTTSService(TTSService): class InputParams(BaseModel): pitch: Optional[str] = None rate: Optional[str] = None volume: Optional[str] = None emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None - language: Optional[str] = "en-US" + language: Optional[Language] = Language.EN gender: Optional[Literal["male", "female", "neutral"]] = None google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None @@ -173,7 +261,9 @@ def __init__( "rate": params.rate, "volume": params.volume, "emphasis": params.emphasis, - "language": params.language, + "language": language_to_google_language(params.language) + if params.language + else "en-US", "gender": params.gender, "google_style": params.google_style, } diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py index c828e7a7a..1ea3f1e62 100644 --- a/src/pipecat/services/lmnt.py +++ b/src/pipecat/services/lmnt.py @@ -22,6 +22,7 @@ ) from pipecat.processors.frame_processor import FrameDirection from 
pipecat.services.ai_services import TTSService +from pipecat.transcriptions.language import Language # See .env.example for LMNT configuration needed try: @@ -34,6 +35,32 @@ raise Exception(f"Missing module: {e}") +def language_to_lmnt_language(language: Language) -> str | None: + match language: + case Language.DE: + return "de" + case ( + Language.EN + | Language.EN_US + | Language.EN_AU + | Language.EN_GB + | Language.EN_NZ + | Language.EN_IN + ): + return "en" + case Language.ES: + return "es" + case Language.FR | Language.FR_CA: + return "fr" + case Language.PT | Language.PT_BR: + return "pt" + case Language.ZH | Language.ZH_TW: + return "zh" + case Language.KO: + return "ko" + return None + + class LmntTTSService(TTSService): def __init__( self, @@ -41,7 +68,7 @@ def __init__( api_key: str, voice_id: str, sample_rate: int = 24000, - language: str = "en", + language: Language = Language.EN, **kwargs, ): # Let TTSService produce TTSStoppedFrames after a short delay of @@ -55,7 +82,7 @@ def __init__( "encoding": "pcm_s16le", "sample_rate": sample_rate, }, - "language": language, + "language": language_to_lmnt_language(language) if language else "en", } self.set_voice(voice_id) diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 7826cfcd8..eb20d5f3c 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -19,6 +19,7 @@ TTSStoppedFrame, ) from pipecat.services.ai_services import TTSService +from pipecat.transcriptions.language import Language try: import resampy @@ -36,12 +37,56 @@ # https://github.com/coqui-ai/xtts-streaming-server +def language_to_xtts_language(language: Language) -> str | None: + match language: + case Language.CS: + return "cs" + case Language.DE: + return "de" + case ( + Language.EN + | Language.EN_US + | Language.EN_AU + | Language.EN_GB + | Language.EN_NZ + | Language.EN_IN + ): + return "en" + case Language.ES: + return "es" + case Language.FR: + return "fr" + case Language.HI: + return "hi" + case Language.HU: + return "hu" + case Language.IT: + return "it" + case Language.JA: + return "ja" + case Language.KO: + return "ko" + case Language.NL: + return "nl" + case Language.PL: + return "pl" + case Language.PT | Language.PT_BR: + return "pt" + case Language.RU: + return "ru" + case Language.TR: + return "tr" + case Language.ZH: + return "zh-cn" + return None + + class XTTSService(TTSService): def __init__( self, *, voice_id: str, - language: str, + language: Language, base_url: str, aiohttp_session: aiohttp.ClientSession, **kwargs, @@ -49,7 +94,7 @@ def __init__( super().__init__(**kwargs) self._settings = { - "language": language, + "language": language_to_xtts_language(language) if language else "en", "base_url": base_url, } self.set_voice(voice_id) From 3d642df2b049eb5f57fc4909f649b96fbef0b43f Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 2 Oct 2024 11:07:48 -0400 Subject: [PATCH 5/5] Revert aligning voice_id name in TTS service constructor --- examples/foundational/07c-interruptible-deepgram.py | 2 +- examples/foundational/07e-interruptible-playht.py | 2 +- examples/foundational/07g-interruptible-openai-tts.py | 2 +- examples/foundational/16-gpu-container-local-bot.py | 2 +- src/pipecat/services/azure.py | 4 ++-- src/pipecat/services/deepgram.py | 4 ++-- src/pipecat/services/openai.py | 4 ++-- src/pipecat/services/playht.py | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py 
index f3b4ee246..fc33c246f 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -50,7 +50,7 @@ async def main(): stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice_id="aura-helios-en") + tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py index 58d85ed79..b05c0d9fd 100644 --- a/examples/foundational/07e-interruptible-playht.py +++ b/examples/foundational/07e-interruptible-playht.py @@ -52,7 +52,7 @@ async def main(): tts = PlayHTTTSService( user_id=os.getenv("PLAYHT_USER_ID"), api_key=os.getenv("PLAYHT_API_KEY"), - voice_id="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json", + voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json", ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/examples/foundational/07g-interruptible-openai-tts.py b/examples/foundational/07g-interruptible-openai-tts.py index b7671c42f..cabf1245e 100644 --- a/examples/foundational/07g-interruptible-openai-tts.py +++ b/examples/foundational/07g-interruptible-openai-tts.py @@ -48,7 +48,7 @@ async def main(): ), ) - tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice_id="alloy") + tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") diff --git a/examples/foundational/16-gpu-container-local-bot.py b/examples/foundational/16-gpu-container-local-bot.py index b3ae9686e..55286eed5 100644 --- a/examples/foundational/16-gpu-container-local-bot.py +++ b/examples/foundational/16-gpu-container-local-bot.py @@ -55,7 +55,7 @@ async def main(): tts = DeepgramTTSService( aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), - voice_id="aura-asteria-en", + voice="aura-asteria-en", base_url="http://0.0.0.0:8080/v1/speak", ) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 078b7df6b..32af1b3f4 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -178,7 +178,7 @@ def __init__( *, api_key: str, region: str, - voice_id="en-US-SaraNeural", + voice="en-US-SaraNeural", sample_rate: int = 16000, params: InputParams = InputParams(), **kwargs, @@ -200,7 +200,7 @@ def __init__( "volume": params.volume, } - self.set_voice(voice_id) + self.set_voice(voice) def can_generate_metrics(self) -> bool: return True diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 55a3ba68e..433e08172 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -49,7 +49,7 @@ def __init__( self, *, api_key: str, - voice_id: str = "aura-helios-en", + voice: str = "aura-helios-en", sample_rate: int = 16000, encoding: str = "linear16", **kwargs, @@ -60,7 +60,7 @@ def __init__( "sample_rate": sample_rate, "encoding": encoding, } - self.set_voice(voice_id) + self.set_voice(voice) self._deepgram_client = DeepgramClient(api_key=api_key) def can_generate_metrics(self) -> bool: diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 68b7fa4de..24d673234 100644 --- a/src/pipecat/services/openai.py +++ 
b/src/pipecat/services/openai.py @@ -381,7 +381,7 @@ def __init__( self, *, api_key: str | None = None, - voice_id: str = "alloy", + voice: str = "alloy", model: Literal["tts-1", "tts-1-hd"] = "tts-1", sample_rate: int = 24000, **kwargs, @@ -392,7 +392,7 @@ def __init__( "sample_rate": sample_rate, } self.set_model_name(model) - self.set_voice(voice_id) + self.set_voice(voice) self._client = AsyncOpenAI(api_key=api_key) diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py index aea5d8d92..fe5feba3a 100644 --- a/src/pipecat/services/playht.py +++ b/src/pipecat/services/playht.py @@ -32,7 +32,7 @@ class PlayHTTTSService(TTSService): def __init__( - self, *, api_key: str, user_id: str, voice_id: str, sample_rate: int = 16000, **kwargs + self, *, api_key: str, user_id: str, voice_url: str, sample_rate: int = 16000, **kwargs ): super().__init__(sample_rate=sample_rate, **kwargs) @@ -49,7 +49,7 @@ def __init__( "format": Format.FORMAT_WAV, "voice_engine": "PlayHT2.0-turbo", } - self.set_voice(voice_id) + self.set_voice(voice_url) self._options = TTSOptions( voice=self._voice_id, sample_rate=self._settings["sample_rate"],
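
For reviewers, a minimal usage sketch of the call sites after this series: Deepgram and OpenAI TTS take a plain `voice` keyword again, PlayHT takes `voice_url`, and services that expose `InputParams` accept a `Language` enum. This is an illustration only, assuming the constructor signatures shown in the diffs above; the environment variable names follow the foundational examples and the ElevenLabs voice id is a placeholder.

    # Illustration only: exercises the constructor changes from this patch series.
    import os

    from pipecat.services.deepgram import DeepgramTTSService
    from pipecat.services.elevenlabs import ElevenLabsTTSService
    from pipecat.services.playht import PlayHTTTSService
    from pipecat.transcriptions.language import Language

    # Deepgram keeps the plain `voice` keyword after the revert in PATCH 5/5.
    deepgram_tts = DeepgramTTSService(
        api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en"
    )

    # PlayHT keeps `voice_url`, matching the reverted example in 07e-interruptible-playht.py.
    playht_tts = PlayHTTTSService(
        user_id=os.getenv("PLAYHT_USER_ID"),
        api_key=os.getenv("PLAYHT_API_KEY"),
        voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
    )

    # Services with InputParams now take a Language enum value (PATCH 4/5).
    elevenlabs_tts = ElevenLabsTTSService(
        api_key=os.getenv("ELEVENLABS_API_KEY"),
        voice_id="your-voice-id",  # placeholder
        params=ElevenLabsTTSService.InputParams(language=Language.EN),
    )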