From 5a83f8cae9aaa427c1b6ad1f1e5594f6d1dbde16 Mon Sep 17 00:00:00 2001 From: duyalei <> Date: Tue, 24 Sep 2024 11:50:17 +0800 Subject: [PATCH] add playback_rate in AzureTTSService --- src/pipecat/services/azure.py | 36 ++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 9177f9e4e..b70aca632 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -42,7 +42,10 @@ CancellationReason, languageconfig, ) - from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream + from azure.cognitiveservices.speech.audio import ( + AudioStreamFormat, + PushAudioInputStream, + ) from azure.cognitiveservices.speech.dialog import AudioConfig except ModuleNotFoundError as e: logger.error(f"Exception: {e}") @@ -54,7 +57,12 @@ class AzureLLMService(BaseOpenAILLMService): def __init__( - self, *, api_key: str, endpoint: str, model: str, api_version: str = "2023-12-01-preview" + self, + *, + api_key: str, + endpoint: str, + model: str, + api_version: str = "2023-12-01-preview", ): # Initialize variables before calling parent __init__() because that # will call create_client() and we need those values there. @@ -78,6 +86,7 @@ def __init__( region: str, voice="en-US-SaraNeural", sample_rate: int = 16000, + playback_rate: float = 1.0, **kwargs, ): super().__init__(sample_rate=sample_rate, **kwargs) @@ -87,6 +96,7 @@ def __init__( self._voice = voice self._sample_rate = sample_rate + self._playback_rate = playback_rate def can_generate_metrics(self) -> bool: return True @@ -106,7 +116,7 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: f"" "" "" - "" + f"" f"{text}" " " ) @@ -119,7 +129,9 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: await self.push_frame(TTSStartedFrame()) # Azure always sends a 44-byte header. Strip it off. yield TTSAudioRawFrame( - audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1 + audio=result.audio_data[44:], + sample_rate=self._sample_rate, + num_channels=1, ) await self.push_frame(TTSStoppedFrame()) elif result.reason == ResultReason.Canceled: @@ -154,12 +166,15 @@ def __init__( speech_config=speech_config, audio_config=audio_config, auto_detect_source_language_config=languageconfig.AutoDetectSourceLanguageConfig( - languages=language) + languages=language + ), ) else: - self._speech_recognizer = SpeechRecognizer(speech_config=speech_config, - audio_config=audio_config, - language=language) + self._speech_recognizer = SpeechRecognizer( + speech_config=speech_config, + audio_config=audio_config, + language=language, + ) self._speech_recognizer.recognized.connect(self._on_handle_recognized) async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: @@ -253,6 +268,9 @@ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]: image_stream = io.BytesIO(await response.content.read()) image = Image.open(image_stream) frame = URLImageRawFrame( - url=image_url, image=image.tobytes(), size=image.size, format=image.format + url=image_url, + image=image.tobytes(), + size=image.size, + format=image.format, ) yield frame