Skip to content

Commit

Permalink
Refactor communicate for better readability
Browse files Browse the repository at this point in the history
Also improve performance on larger documents.

Signed-off-by: rany2 <[email protected]>
  • Loading branch information
rany2 committed Feb 16, 2024
1 parent df6bac8 commit 9e96847
Showing 1 changed file with 111 additions and 136 deletions.
247 changes: 111 additions & 136 deletions src/edge_tts/communicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,68 +304,40 @@ def __init__(
async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
"""Streams audio and metadata from the service."""

texts = split_text_by_byte_length(
escape(remove_incompatible_characters(self.text)),
calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
)
final_utterance: Dict[int, int] = {}
prev_idx = -1
shift_time = -1
async def send_request(websocket: aiohttp.ClientWebSocketResponse) -> None:
"""Sends the request to the service."""

# Each message needs to have the proper date.
date = date_to_string()

# Prepare the request to be sent to the service.
#
# Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
# to be booleans, but Edge Browser seems to send them as strings.
#
# This is a bug in Edge as Azure Cognitive Services actually sends them as
# bool and not string. For now I will send them as bool unless it causes
# any problems.
#
# Only the first fragment below is an f-string; the JSON fragments are plain
# string literals, so their braces must stay single (do not double them).
await websocket.send_str(
f"X-Timestamp:{date}\r\n"
"Content-Type:application/json; charset=utf-8\r\n"
"Path:speech.config\r\n\r\n"
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
'"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
'"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
"}}}}\r\n"
)

ssl_ctx = ssl.create_default_context(cafile=certifi.where())
for idx, text in enumerate(texts):
async with aiohttp.ClientSession(
trust_env=True,
) as session, session.ws_connect(
f"{WSS_URL}&ConnectionId={connect_id()}",
compress=15,
autoclose=True,
autoping=True,
proxy=self.proxy,
headers={
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
},
ssl=ssl_ctx,
) as websocket:
# download_audio indicates whether we should be expecting audio data;
# this is so that we avoid getting binary data from the websocket
# and falsely thinking it's audio data.
download_audio = False

# audio_was_received indicates whether we have received audio data
# from the websocket. This is so we can raise an exception if we
# don't receive any audio data.
audio_was_received = False

# Each message needs to have the proper date.
date = date_to_string()

# Prepare the request to be sent to the service.
#
# Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
# to be booleans, but Edge Browser seems to send them as strings.
#
# This is a bug in Edge as Azure Cognitive Services actually sends them as
# bool and not string. For now I will send them as bool unless it causes
# any problems.
#
# Only the first fragment below is an f-string; the JSON fragments are plain
# string literals, so their braces must stay single (do not double them).
await websocket.send_str(
f"X-Timestamp:{date}\r\n"
"Content-Type:application/json; charset=utf-8\r\n"
"Path:speech.config\r\n\r\n"
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
'"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
'"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
"}}}}\r\n"
)
# Split the text into multiple strings if it is too long for the service.
texts = split_text_by_byte_length(
escape(remove_incompatible_characters(self.text)),
calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
)

# Send the request to the service.
for text in texts:
await websocket.send_str(
ssml_headers_plus_data(
connect_id(),
Expand All @@ -374,86 +346,89 @@ async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
)
)

async for received in websocket:
if received.type == aiohttp.WSMsgType.TEXT:
parameters, data = get_headers_and_data(received.data)
path = parameters.get(b"Path")
if path == b"turn.start":
download_audio = True
elif path == b"turn.end":
download_audio = False
break # End of audio data
elif path == b"audio.metadata":
for meta_obj in json.loads(data)["Metadata"]:
meta_type = meta_obj["Type"]
if idx != prev_idx:
shift_time = sum(
final_utterance[i] for i in range(idx)
)
prev_idx = idx
if meta_type == "WordBoundary":
final_utterance[idx] = (
meta_obj["Data"]["Offset"]
+ meta_obj["Data"]["Duration"]
# Average padding added by the service
# Alternatively we could use ffmpeg to get value properly
# but I don't want to add an additional dependency
# if this is found to work well enough.
+ 8_750_000
)
yield {
"type": meta_type,
"offset": meta_obj["Data"]["Offset"]
+ shift_time,
"duration": meta_obj["Data"]["Duration"],
"text": meta_obj["Data"]["text"]["Text"],
}
elif meta_type == "SessionEnd":
continue
else:
raise UnknownResponse(
f"Unknown metadata type: {meta_type}"
)
elif path == b"response":
pass
else:
raise UnknownResponse(
"The response from the service is not recognized.\n"
+ received.data
)
elif received.type == aiohttp.WSMsgType.BINARY:
if not download_audio:
raise UnexpectedResponse(
"We received a binary message, but we are not expecting one."
)

if len(received.data) < 2:
raise UnexpectedResponse(
"We received a binary message, but it is missing the header length."
)

# See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
header_length = int.from_bytes(received.data[:2], "big")
if len(received.data) < header_length + 2:
raise UnexpectedResponse(
"We received a binary message, but it is missing the audio data."
)

yield {
"type": "audio",
"data": received.data[header_length + 2 :],
}
audio_was_received = True
elif received.type == aiohttp.WSMsgType.ERROR:
raise WebSocketError(
received.data if received.data else "Unknown error"
ssl_ctx = ssl.create_default_context(cafile=certifi.where())
async with aiohttp.ClientSession(
trust_env=True,
) as session, session.ws_connect(
f"{WSS_URL}&ConnectionId={connect_id()}",
compress=15,
autoclose=True,
autoping=True,
proxy=self.proxy,
headers={
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
},
ssl=ssl_ctx,
) as websocket:
# audio_was_received indicates whether we have received audio data
# from the websocket. This is so we can raise an exception if we
# don't receive any audio data.
audio_was_received = False

# Send the request to the service.
await send_request(websocket)

async for received in websocket:
if received.type == aiohttp.WSMsgType.TEXT:
parameters, data = get_headers_and_data(received.data)
path = parameters.get(b"Path")
if path == b"audio.metadata":
for meta_obj in json.loads(data)["Metadata"]:
meta_type = meta_obj["Type"]
if meta_type == "WordBoundary":
yield {
"type": meta_type,
"offset": meta_obj["Data"]["Offset"],
"duration": meta_obj["Data"]["Duration"],
"text": meta_obj["Data"]["text"]["Text"],
}
elif meta_type in ("SessionEnd",):
continue
else:
raise UnknownResponse(
f"Unknown metadata type: {meta_type}"
)
elif path in (b"response", b"turn.start", b"turn.end"):
pass
else:
raise UnknownResponse(
"The response from the service is not recognized.\n"
+ received.data
)
elif received.type == aiohttp.WSMsgType.BINARY:
if len(received.data) < 2:
raise UnexpectedResponse(
"We received a binary message, but it is missing the header length."
)

# See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
header_length = int.from_bytes(received.data[:2], "big")
if len(received.data) < header_length + 2:
raise UnexpectedResponse(
"We received a binary message, but it is missing the audio data."
)

if not audio_was_received:
raise NoAudioReceived(
"No audio was received. Please verify that your parameters are correct."
audio_was_received = True
yield {
"type": "audio",
"data": received.data[header_length + 2 :],
}
elif received.type == aiohttp.WSMsgType.ERROR:
raise WebSocketError(
received.data if received.data else "Unknown error"
)

if not audio_was_received:
raise NoAudioReceived(
"No audio was received. Please verify that your parameters are correct."
)

async def save(
self,
audio_fname: Union[str, bytes],
Expand Down

0 comments on commit 9e96847

Please sign in to comment.