Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add sentence boundary which is more friendly to Chinese users #346

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,4 @@ cython_debug/
# Edge-TTS specific ignores
*.mp3
*.srt
/.idea/
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ async def amain() -> None:
async for chunk in communicate.stream():
if chunk["type"] == "audio":
file.write(chunk["data"])
elif chunk["type"] == "WordBoundary":
elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
submaker.feed(chunk)

with open(SRT_FILE, "w", encoding="utf-8") as file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def main() -> None:
for chunk in communicate.stream_sync():
if chunk["type"] == "audio":
file.write(chunk["data"])
elif chunk["type"] == "WordBoundary":
elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
submaker.feed(chunk)

with open(SRT_FILE, "w", encoding="utf-8") as file:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3

"""Sync variant of the async .stream() method to
get audio and sentence boundary chunks, feed the boundaries to
SubMaker and print the generated SRT subtitles to stdout"""
import sys

import edge_tts

# Sample Chinese text passed to edge_tts.Communicate as the synthesis input.
TEXT = """君不见,黄河之水天上来,奔流到海不复回。
君不见,高堂明镜悲白发,朝如青丝暮成雪。
人生得意须尽欢,莫使金樽空对月。
天生我材必有用,千金散尽还复来。
烹羊宰牛且为乐,会须一饮三百杯。
岑夫子,丹丘生,将进酒,杯莫停。
与君歌一曲,请君为我倾耳听。
钟鼓馔玉不足贵,但愿长醉不复醒。
古来圣贤皆寂寞,惟有饮者留其名。
陈王昔时宴平乐,斗酒十千恣欢谑。
主人何为言少钱,径须沽取对君酌。
五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。"""
# Voice name passed to edge_tts.Communicate together with TEXT.
VOICE = "zh-CN-YunjianNeural"


def main() -> None:
    """Synthesize TEXT synchronously and print sentence-level subtitles.

    Streams chunks from ``Communicate.stream_sync()``. Audio chunks are only
    measured (their bytes are counted, not stored); boundary chunks are fed to
    a ``SubMaker``. The total audio size in bytes and the generated SRT text
    are then written to stdout.
    """
    communicate = edge_tts.Communicate(TEXT, VOICE, boundary="SentenceBoundary")
    submaker = edge_tts.SubMaker()
    audio_length = 0
    for chunk in communicate.stream_sync():
        if chunk["type"] == "audio":
            # Count bytes rather than chunks: the message below reports the
            # audio length, and the number of chunks is not that.
            audio_length += len(chunk["data"])
        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
            submaker.feed(chunk)

    # End the summary line with a newline so the SRT output that follows
    # starts on its own line instead of being glued to the number.
    sys.stdout.write(f"audio file length: {audio_length}\n")
    sys.stdout.write(submaker.get_srt())


if __name__ == "__main__":
    main()
21 changes: 17 additions & 4 deletions src/edge_tts/communicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,9 +249,18 @@ def __init__(
proxy: Optional[str] = None,
connect_timeout: Optional[int] = 10,
receive_timeout: Optional[int] = 60,
boundary: str = "WordBoundary",
):
"""
Args:
boundary (str): The boundary to use for the TTS.
Defaults to "WordBoundary".
Valid values are "WordBoundary" and "SentenceBoundary".
If "WordBoundary", the TTS will return a word boundary for each word.
If "SentenceBoundary", the TTS will return a sentence boundary for each sentence, which is more friendly to Chinese users.
"""
# Validate TTS settings and store the TTSConfig object.
self.tts_config = TTSConfig(voice, rate, volume, pitch)
self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)

# Validate the text parameter.
if not isinstance(text, str):
Expand Down Expand Up @@ -296,7 +305,7 @@ def __init__(
def __parse_metadata(self, data: bytes) -> TTSChunk:
for meta_obj in json.loads(data)["Metadata"]:
meta_type = meta_obj["Type"]
if meta_type == "WordBoundary":
if meta_type in ("WordBoundary", "SentenceBoundary"):
current_offset = (
meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
)
Expand All @@ -315,12 +324,16 @@ def __parse_metadata(self, data: bytes) -> TTSChunk:
async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
async def send_command_request() -> None:
"""Sends the command request to the service."""
wordBoundary = self.tts_config.boundary == "WordBoundary"
wd = "true" if wordBoundary else "false"
sq = "true" if not wordBoundary else "false"
await websocket.send_str(
f"X-Timestamp:{date_to_string()}\r\n"
"Content-Type:application/json; charset=utf-8\r\n"
"Path:speech.config\r\n\r\n"
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
'"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
f'"sentenceBoundaryEnabled":"{sq}","wordBoundaryEnabled":"{wd}"'
'},'
'"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
"}}}}\r\n"
)
Expand Down Expand Up @@ -509,7 +522,7 @@ async def save(
audio.write(message["data"])
elif (
isinstance(metadata, TextIOWrapper)
and message["type"] == "WordBoundary"
and message["type"] in ("WordBoundary", "SentenceBoundary")
):
json.dump(message, metadata)
metadata.write("\n")
Expand Down
1 change: 1 addition & 0 deletions src/edge_tts/data_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class TTSConfig:
rate: str
volume: str
pitch: str
boundary: str

@staticmethod
def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
Expand Down
10 changes: 5 additions & 5 deletions src/edge_tts/submaker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""SubMaker module is used to generate subtitles from WordBoundary events."""
"""SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events."""

from typing import List

Expand All @@ -9,23 +9,23 @@

class SubMaker:
"""
SubMaker is used to generate subtitles from WordBoundary messages.
SubMaker is used to generate subtitles from WordBoundary and SentenceBoundary messages.
"""

def __init__(self) -> None:
self.cues: List[srt.Subtitle] = [] # type: ignore

def feed(self, msg: TTSChunk) -> None:
"""
Feed a WordBoundary message to the SubMaker object.
Feed a WordBoundary or SentenceBoundary message to the SubMaker object.

Args:
msg (dict): The WordBoundary message.
msg (dict): The WordBoundary or SentenceBoundary message.

Returns:
None
"""
if msg["type"] != "WordBoundary":
if msg["type"] not in ("WordBoundary", "SentenceBoundary"):
raise ValueError("Invalid message type, expected 'WordBoundary'")

self.cues.append(
Expand Down
8 changes: 4 additions & 4 deletions src/edge_tts/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
class TTSChunk(TypedDict):
"""TTS chunk data."""

type: Literal["audio", "WordBoundary"]
type: Literal["audio", "WordBoundary", "SentenceBoundary"]
data: NotRequired[bytes] # only for audio
duration: NotRequired[float] # only for WordBoundary
offset: NotRequired[float] # only for WordBoundary
text: NotRequired[str] # only for WordBoundary
duration: NotRequired[float] # only for WordBoundary and SentenceBoundary
offset: NotRequired[float] # only for WordBoundary and SentenceBoundary
text: NotRequired[str] # only for WordBoundary and SentenceBoundary


class VoiceTag(TypedDict):
Expand Down
2 changes: 1 addition & 1 deletion src/edge_tts/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ async def _run_tts(args: UtilArgs) -> None:
async for chunk in communicate.stream():
if chunk["type"] == "audio":
audio_file.write(chunk["data"])
elif chunk["type"] == "WordBoundary":
elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
submaker.feed(chunk)

if args.words_in_cue > 0:
Expand Down
Loading