Add input params to Azure TTS

pipecat-ai · Sep 23, 2024 · 8edee81 · 8edee81
1 parent c262b27
commit 8edee81
Showing 1 changed file with 76 additions and 39 deletions.
diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
@@ -4,45 +4,34 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-import aiohttp
 import asyncio
 import io
+from typing import AsyncGenerator, Optional
 
+import aiohttp
+from loguru import logger
 from PIL import Image
-from typing import AsyncGenerator
-
-from pipecat.frames.frames import (
-    CancelFrame,
-    EndFrame,
-    ErrorFrame,
-    Frame,
-    StartFrame,
-    TTSAudioRawFrame,
-    TTSStartedFrame,
-    TTSStoppedFrame,
-    TranscriptionFrame,
-    URLImageRawFrame,
-)
-from pipecat.metrics.metrics import TTSUsageMetricsData
-from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.ai_services import STTService, TTSService, ImageGenService
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame,
+                                   StartFrame, TranscriptionFrame,
+                                   TTSAudioRawFrame, TTSStartedFrame,
+                                   TTSStoppedFrame, URLImageRawFrame)
+from pipecat.services.ai_services import (ImageGenService, STTService,
+                                          TTSService)
 from pipecat.services.openai import BaseOpenAILLMService
 from pipecat.utils.time import time_now_iso8601
 
-from loguru import logger
-
 # See .env.example for Azure configuration needed
 try:
-    from openai import AsyncAzureOpenAI
-    from azure.cognitiveservices.speech import (
-        SpeechConfig,
-        SpeechRecognizer,
-        SpeechSynthesizer,
-        ResultReason,
-        CancellationReason,
-    )
-    from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream
+    from azure.cognitiveservices.speech import (CancellationReason,
+                                                ResultReason, SpeechConfig,
+                                                SpeechRecognizer,
+                                                SpeechSynthesizer)
+    from azure.cognitiveservices.speech.audio import (AudioStreamFormat,
+                                                      PushAudioInputStream)
     from azure.cognitiveservices.speech.dialog import AudioConfig
+    from openai import AsyncAzureOpenAI
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error(
@@ -70,13 +59,25 @@ def create_client(self, api_key=None, base_url=None, **kwargs):
 
 
 class AzureTTSService(TTSService):
+    class InputParams(BaseModel):
+        emphasis: Optional[str] = None
+        language_code: Optional[str] = "en-US"
+        pitch: Optional[str] = None
+        rate: Optional[str] = "1.05"
+        role: Optional[str] = None
+        style: Optional[str] = None
+        style_degree: Optional[str] = None
+        volume: Optional[str] = None
+
+
     def __init__(
         self,
         *,
         api_key: str,
         region: str,
         voice="en-US-SaraNeural",
         sample_rate: int = 16000,
+        params: InputParams = InputParams(),
         **kwargs,
     ):
         super().__init__(sample_rate=sample_rate, **kwargs)
@@ -86,10 +87,55 @@ def __init__(
 
         self._voice = voice
         self._sample_rate = sample_rate
+        self._params = params
 
     def can_generate_metrics(self) -> bool:
         return True
 
+    def _construct_ssml(self, text: str) -> str:
+        ssml = (
+            f"<speak version='1.0' xml:lang='{self._params.language_code}' "
+            "xmlns='http://www.w3.org/2001/10/synthesis' "
+            "xmlns:mstts='http://www.w3.org/2001/mstts'>"
+            f"<voice name='{self._voice}'>"
+            "<mstts:silence type='Sentenceboundary' value='20ms' />"
+        )
+
+        if self._params.style:
+            ssml += f"<mstts:express-as style='{self._params.style}'"
+            if self._params.style_degree:
+                ssml += f" styledegree='{self._params.style_degree}'"
+            if self._params.role:
+                ssml += f" role='{self._params.role}'"
+            ssml += ">"
+
+        prosody_attrs = []
+        if self._params.rate:
+            prosody_attrs.append(f"rate='{self._params.rate}'")
+        if self._params.pitch:
+            prosody_attrs.append(f"pitch='{self._params.pitch}'")
+        if self._params.volume:
+            prosody_attrs.append(f"volume='{self._params.volume}'")
+
+        ssml += f"<prosody {' '.join(prosody_attrs)}>"
+
+        if self._params.emphasis:
+            ssml += f"<emphasis level='{self._params.emphasis}'>"
+
+        ssml += text
+
+        if self._params.emphasis:
+            ssml += "</emphasis>"
+
+        ssml += "</prosody>"
+
+        if self._params.style:
+            ssml += "</mstts:express-as>"
+
+        ssml += "</voice></speak>"
+
+        return ssml
+
     async def set_voice(self, voice: str):
         logger.debug(f"Switching TTS voice to: [{voice}]")
         self._voice = voice
@@ -99,16 +145,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
         await self.start_ttfb_metrics()
 
-        ssml = (
-            "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
-            "xmlns:mstts='http://www.w3.org/2001/mstts'>"
-            f"<voice name='{self._voice}'>"
-            "<mstts:silence type='Sentenceboundary' value='20ms' />"
-            "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
-            "<prosody rate='1.05'>"
-            f"{text}"
-            "</prosody></mstts:express-as></voice></speak> "
-        )
+        ssml = self._construct_ssml(text)
 
         result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml))