diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py index 06cc145..4129a58 100644 --- a/src/edge_tts/communicate.py +++ b/src/edge_tts/communicate.py @@ -304,68 +304,40 @@ def __init__( async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: """Streams audio and metadata from the service.""" - texts = split_text_by_byte_length( - escape(remove_incompatible_characters(self.text)), - calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch), - ) - final_utterance: Dict[int, int] = {} - prev_idx = -1 - shift_time = -1 + async def send_request(websocket: aiohttp.ClientWebSocketResponse) -> None: + """Sends the request to the service.""" + + # Each message needs to have the proper date. + date = date_to_string() + + # Prepare the request to be sent to the service. + # + # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed + # to be booleans, but Edge Browser seems to send them as strings. + # + # This is a bug in Edge as Azure Cognitive Services actually sends them as + # bool and not string. For now I will send them as bool unless it causes + # any problems. + # + # Also pay close attention to double { } in request (escape for f-string). + await websocket.send_str( + f"X-Timestamp:{date}\r\n" + "Content-Type:application/json; charset=utf-8\r\n" + "Path:speech.config\r\n\r\n" + '{"context":{"synthesis":{"audio":{"metadataoptions":{' + '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},' + '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"' + "}}}}\r\n" + ) - ssl_ctx = ssl.create_default_context(cafile=certifi.where()) - for idx, text in enumerate(texts): - async with aiohttp.ClientSession( - trust_env=True, - ) as session, session.ws_connect( - f"{WSS_URL}&ConnectionId={connect_id()}", - compress=15, - autoclose=True, - autoping=True, - proxy=self.proxy, - headers={ - "Pragma": "no-cache", - "Cache-Control": "no-cache", - "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "en-US,en;q=0.9", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41", - }, - ssl=ssl_ctx, - ) as websocket: - # download indicates whether we should be expecting audio data, - # this is so what we avoid getting binary data from the websocket - # and falsely thinking it's audio data. - download_audio = False - - # audio_was_received indicates whether we have received audio data - # from the websocket. This is so we can raise an exception if we - # don't receive any audio data. - audio_was_received = False - - # Each message needs to have the proper date. - date = date_to_string() - - # Prepare the request to be sent to the service. - # - # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed - # to be booleans, but Edge Browser seems to send them as strings. - # - # This is a bug in Edge as Azure Cognitive Services actually sends them as - # bool and not string. For now I will send them as bool unless it causes - # any problems. - # - # Also pay close attention to double { } in request (escape for f-string). - await websocket.send_str( - f"X-Timestamp:{date}\r\n" - "Content-Type:application/json; charset=utf-8\r\n" - "Path:speech.config\r\n\r\n" - '{"context":{"synthesis":{"audio":{"metadataoptions":{' - '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},' - '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"' - "}}}}\r\n" - ) + # Split the text into multiple strings if it is too long for the service. + texts = split_text_by_byte_length( + escape(remove_incompatible_characters(self.text)), + calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch), + ) + # Send the request to the service. + for text in texts: await websocket.send_str( ssml_headers_plus_data( connect_id(), @@ -374,86 +346,89 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: ) ) - async for received in websocket: - if received.type == aiohttp.WSMsgType.TEXT: - parameters, data = get_headers_and_data(received.data) - path = parameters.get(b"Path") - if path == b"turn.start": - download_audio = True - elif path == b"turn.end": - download_audio = False - break # End of audio data - elif path == b"audio.metadata": - for meta_obj in json.loads(data)["Metadata"]: - meta_type = meta_obj["Type"] - if idx != prev_idx: - shift_time = sum( - final_utterance[i] for i in range(idx) - ) - prev_idx = idx - if meta_type == "WordBoundary": - final_utterance[idx] = ( - meta_obj["Data"]["Offset"] - + meta_obj["Data"]["Duration"] - # Average padding added by the service - # Alternatively we could use ffmpeg to get value properly - # but I don't want to add an additional dependency - # if this is found to work well enough. - + 8_750_000 - ) - yield { - "type": meta_type, - "offset": meta_obj["Data"]["Offset"] - + shift_time, - "duration": meta_obj["Data"]["Duration"], - "text": meta_obj["Data"]["text"]["Text"], - } - elif meta_type == "SessionEnd": - continue - else: - raise UnknownResponse( - f"Unknown metadata type: {meta_type}" - ) - elif path == b"response": - pass - else: - raise UnknownResponse( - "The response from the service is not recognized.\n" - + received.data - ) - elif received.type == aiohttp.WSMsgType.BINARY: - if not download_audio: - raise UnexpectedResponse( - "We received a binary message, but we are not expecting one." - ) - - if len(received.data) < 2: - raise UnexpectedResponse( - "We received a binary message, but it is missing the header length." - ) - - # See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46 - header_length = int.from_bytes(received.data[:2], "big") - if len(received.data) < header_length + 2: - raise UnexpectedResponse( - "We received a binary message, but it is missing the audio data." - ) - - yield { - "type": "audio", - "data": received.data[header_length + 2 :], - } - audio_was_received = True - elif received.type == aiohttp.WSMsgType.ERROR: - raise WebSocketError( - received.data if received.data else "Unknown error" + ssl_ctx = ssl.create_default_context(cafile=certifi.where()) + async with aiohttp.ClientSession( + trust_env=True, + ) as session, session.ws_connect( + f"{WSS_URL}&ConnectionId={connect_id()}", + compress=15, + autoclose=True, + autoping=True, + proxy=self.proxy, + headers={ + "Pragma": "no-cache", + "Cache-Control": "no-cache", + "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "en-US,en;q=0.9", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41", + }, + ssl=ssl_ctx, + ) as websocket: + # audio_was_received indicates whether we have received audio data + # from the websocket. This is so we can raise an exception if we + # don't receive any audio data. + audio_was_received = False + + # Send the request to the service. + await send_request(websocket) + + async for received in websocket: + if received.type == aiohttp.WSMsgType.TEXT: + parameters, data = get_headers_and_data(received.data) + path = parameters.get(b"Path") + if path == b"audio.metadata": + for meta_obj in json.loads(data)["Metadata"]: + meta_type = meta_obj["Type"] + if meta_type == "WordBoundary": + yield { + "type": meta_type, + "offset": meta_obj["Data"]["Offset"], + "duration": meta_obj["Data"]["Duration"], + "text": meta_obj["Data"]["text"]["Text"], + } + elif meta_type in ("SessionEnd",): + continue + else: + raise UnknownResponse( + f"Unknown metadata type: {meta_type}" + ) + elif path in (b"response", b"turn.start", b"turn.end"): + pass + else: + raise UnknownResponse( + "The response from the service is not recognized.\n" + + received.data + ) + elif received.type == aiohttp.WSMsgType.BINARY: + if len(received.data) < 2: + raise UnexpectedResponse( + "We received a binary message, but it is missing the header length." + ) + + # See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46 + header_length = int.from_bytes(received.data[:2], "big") + if len(received.data) < header_length + 2: + raise UnexpectedResponse( + "We received a binary message, but it is missing the audio data." ) - if not audio_was_received: - raise NoAudioReceived( - "No audio was received. Please verify that your parameters are correct." + audio_was_received = header_length > 0 + yield { + "type": "audio", + "data": received.data[header_length + 2 :], + } + elif received.type == aiohttp.WSMsgType.ERROR: + raise WebSocketError( + received.data if received.data else "Unknown error" ) + if not audio_was_received: + raise NoAudioReceived( + "No audio was received. Please verify that your parameters are correct." + ) + async def save( self, audio_fname: Union[str, bytes],