diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 24e73cd2a..efb6a6fe4 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -4,45 +4,34 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp import asyncio import io +from typing import AsyncGenerator, Optional +import aiohttp +from loguru import logger from PIL import Image -from typing import AsyncGenerator - -from pipecat.frames.frames import ( - CancelFrame, - EndFrame, - ErrorFrame, - Frame, - StartFrame, - TTSAudioRawFrame, - TTSStartedFrame, - TTSStoppedFrame, - TranscriptionFrame, - URLImageRawFrame, -) -from pipecat.metrics.metrics import TTSUsageMetricsData -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import STTService, TTSService, ImageGenService +from pydantic import BaseModel + +from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame, + StartFrame, TranscriptionFrame, + TTSAudioRawFrame, TTSStartedFrame, + TTSStoppedFrame, URLImageRawFrame) +from pipecat.services.ai_services import (ImageGenService, STTService, + TTSService) from pipecat.services.openai import BaseOpenAILLMService from pipecat.utils.time import time_now_iso8601 -from loguru import logger - # See .env.example for Azure configuration needed try: - from openai import AsyncAzureOpenAI - from azure.cognitiveservices.speech import ( - SpeechConfig, - SpeechRecognizer, - SpeechSynthesizer, - ResultReason, - CancellationReason, - ) - from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream + from azure.cognitiveservices.speech import (CancellationReason, + ResultReason, SpeechConfig, + SpeechRecognizer, + SpeechSynthesizer) + from azure.cognitiveservices.speech.audio import (AudioStreamFormat, + PushAudioInputStream) from azure.cognitiveservices.speech.dialog import AudioConfig + from openai import AsyncAzureOpenAI except ModuleNotFoundError as e: 
     logger.error(f"Exception: {e}")
     logger.error(
@@ -70,6 +59,17 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
 
 
 class AzureTTSService(TTSService):
+    class InputParams(BaseModel):
+        emphasis: Optional[str] = None
+        language_code: Optional[str] = "en-US"
+        pitch: Optional[str] = None
+        rate: Optional[str] = "1.05"
+        role: Optional[str] = None
+        style: Optional[str] = None
+        style_degree: Optional[str] = None
+        volume: Optional[str] = None
+
+
     def __init__(
         self,
         *,
@@ -77,6 +77,7 @@ def __init__(
         region: str,
         voice="en-US-SaraNeural",
         sample_rate: int = 16000,
+        params: InputParams = InputParams(),
         **kwargs,
     ):
         super().__init__(sample_rate=sample_rate, **kwargs)
@@ -86,10 +87,55 @@ def __init__(
         self._voice = voice
         self._sample_rate = sample_rate
+        self._params = params
 
     def can_generate_metrics(self) -> bool:
         return True
 
+    def _construct_ssml(self, text: str) -> str:
+        ssml = (
+            f"<speak version='1.0' xml:lang='{self._params.language_code}' "
+            "xmlns='http://www.w3.org/2001/10/synthesis' "
+            "xmlns:mstts='http://www.w3.org/2001/mstts'>"
+            f"<voice name='{self._voice}'>"
+            "<mstts:silence type='Sentenceboundary' value='20ms' />"
+        )
+
+        if self._params.style:
+            ssml += f"<mstts:express-as style='{self._params.style}'"
+            if self._params.style_degree:
+                ssml += f" styledegree='{self._params.style_degree}'"
+            if self._params.role:
+                ssml += f" role='{self._params.role}'"
+            ssml += ">"
+
+        prosody_attrs = []
+        if self._params.rate:
+            prosody_attrs.append(f"rate='{self._params.rate}'")
+        if self._params.pitch:
+            prosody_attrs.append(f"pitch='{self._params.pitch}'")
+        if self._params.volume:
+            prosody_attrs.append(f"volume='{self._params.volume}'")
+
+        ssml += f"<prosody {' '.join(prosody_attrs)}>"
+
+        if self._params.emphasis:
+            ssml += f"<emphasis level='{self._params.emphasis}'>"
+
+        ssml += text
+
+        if self._params.emphasis:
+            ssml += "</emphasis>"
+
+        ssml += "</prosody>"
+
+        if self._params.style:
+            ssml += "</mstts:express-as>"
+
+        ssml += "</voice></speak>"
+
+        return ssml
+
     async def set_voice(self, voice: str):
         logger.debug(f"Switching TTS voice to: [{voice}]")
         self._voice = voice
@@ -99,16 +145,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
         await self.start_ttfb_metrics()
 
-        ssml = (
-            "<speak version='1.0' xml:lang='en-US' "
-            "xmlns='http://www.w3.org/2001/10/synthesis' "
-            "xmlns:mstts='http://www.w3.org/2001/mstts'>"
-            f"<voice name='{self._voice}'>"
-            "<mstts:silence type='Sentenceboundary' value='20ms' />"
-            "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
-            "<prosody rate='1.05'>"
-            f"{text}"
-            "</prosody></mstts:express-as></voice></speak> "
-        )
+        ssml = self._construct_ssml(text)
 
         result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml))