rany2 · chnyangjie · Jan 9, 2025 · Jan 10, 2025
diff --git a/.gitignore b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 # Edge-TTS specific ignores
 *.mp3
 *.srt
+/.idea/
diff --git a/examples/async_audio_streaming_with_predefined_voice_and_subtitles.py b/examples/async_audio_streaming_with_predefined_voice_and_subtitles.py
@@ -21,7 +21,7 @@ async def amain() -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)
 
     with open(SRT_FILE, "w", encoding="utf-8") as file:

diff --git a/examples/sync_audio_streaming_with_predefined_voice_subtitles.py b/examples/sync_audio_streaming_with_predefined_voice_subtitles.py
@@ -20,7 +20,7 @@ def main() -> None:
         for chunk in communicate.stream_sync():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)
 
     with open(SRT_FILE, "w", encoding="utf-8") as file:

diff --git a/examples/sync_audio_streaming_with_predefined_voice_subtitles_print2stdout.py b/examples/sync_audio_streaming_with_predefined_voice_subtitles_print2stdout.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+"""Sync variant of the async .stream() method to
+collect audio chunks, feed boundary events to SubMaker,
+and print the generated subtitles to stdout"""
+import sys
+
+import edge_tts
+
+TEXT = """君不见，黄河之水天上来，奔流到海不复回。
+君不见，高堂明镜悲白发，朝如青丝暮成雪。
+人生得意须尽欢，莫使金樽空对月。
+天生我材必有用，千金散尽还复来。
+烹羊宰牛且为乐，会须一饮三百杯。
+岑夫子，丹丘生，将进酒，杯莫停。
+与君歌一曲，请君为我倾耳听。
+钟鼓馔玉不足贵，但愿长醉不复醒。
+古来圣贤皆寂寞，惟有饮者留其名。
+陈王昔时宴平乐，斗酒十千恣欢谑。
+主人何为言少钱，径须沽取对君酌。
+五花马，千金裘，呼儿将出换美酒，与尔同销万古愁。"""
+VOICE = "zh-CN-YunjianNeural"
+
+
+def main() -> None:
+    """Synthesize TEXT, then print the audio size in bytes and the SRT subtitles."""
+    # Request sentence-level boundary events instead of the default per-word ones.
+    communicate = edge_tts.Communicate(TEXT, VOICE, boundary="SentenceBoundary")
+    submaker = edge_tts.SubMaker()
+    audio_chunks = []
+    for chunk in communicate.stream_sync():
+        if chunk["type"] == "audio":
+            audio_chunks.append(chunk["data"])
+        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
+            submaker.feed(chunk)
+
+    audio_length = sum(len(data) for data in audio_chunks)  # bytes, not chunk count
+    sys.stdout.write(f"audio file length: {audio_length}\n{submaker.get_srt()}")
+
+if __name__ == "__main__":
+    main()
diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
@@ -249,9 +249,18 @@ def __init__(
         proxy: Optional[str] = None,
         connect_timeout: Optional[int] = 10,
         receive_timeout: Optional[int] = 60,
+        boundary: str = "WordBoundary",
     ):
+        """
+        Args:
+            boundary (str): The boundary to use for the TTS.
+                Defaults to "WordBoundary".
+                Valid values are "WordBoundary" and "SentenceBoundary".
+                If "WordBoundary", the TTS will return a word boundary for each word.
+                If "SentenceBoundary", the TTS will return a sentence boundary for each sentence, which is more friendly to Chinese users.
+        """
         # Validate TTS settings and store the TTSConfig object.
-        self.tts_config = TTSConfig(voice, rate, volume, pitch)
+        self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)
 
         # Validate the text parameter.
         if not isinstance(text, str):
@@ -296,7 +305,7 @@ def __init__(
     def __parse_metadata(self, data: bytes) -> TTSChunk:
         for meta_obj in json.loads(data)["Metadata"]:
             meta_type = meta_obj["Type"]
-            if meta_type == "WordBoundary":
+            if meta_type in ("WordBoundary", "SentenceBoundary"):
                 current_offset = (
                     meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                 )
@@ -315,12 +324,16 @@ def __parse_metadata(self, data: bytes) -> TTSChunk:
     async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
         async def send_command_request() -> None:
             """Sends the command request to the service."""
+            # Exactly one boundary kind is enabled; the config takes string booleans.
+            word_enabled = str(self.tts_config.boundary == "WordBoundary").lower()
+            sentence_enabled = "false" if word_enabled == "true" else "true"
             await websocket.send_str(
                 f"X-Timestamp:{date_to_string()}\r\n"
                 "Content-Type:application/json; charset=utf-8\r\n"
                 "Path:speech.config\r\n\r\n"
                 '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
+                f'"sentenceBoundaryEnabled":"{sentence_enabled}",'
+                f'"wordBoundaryEnabled":"{word_enabled}"}},'
                 '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                 "}}}}\r\n"
             )
@@ -509,7 +522,7 @@ async def save(
                     audio.write(message["data"])
                 elif (
                     isinstance(metadata, TextIOWrapper)
-                    and message["type"] == "WordBoundary"
+                    and message["type"] in ("WordBoundary", "SentenceBoundary")
                 ):
                     json.dump(message, metadata)
                     metadata.write("\n")

diff --git a/src/edge_tts/data_classes.py b/src/edge_tts/data_classes.py
@@ -17,6 +17,7 @@ class TTSConfig:
     rate: str
     volume: str
     pitch: str
+    boundary: str
 
     @staticmethod
     def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:

diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py
@@ -1,4 +1,4 @@
-"""SubMaker module is used to generate subtitles from WordBoundary events."""
+"""SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events."""
 
 from typing import List
 
@@ -9,23 +9,23 @@
 
 class SubMaker:
     """
-    SubMaker is used to generate subtitles from WordBoundary messages.
+    SubMaker is used to generate subtitles from WordBoundary and SentenceBoundary messages.
     """
 
     def __init__(self) -> None:
         self.cues: List[srt.Subtitle] = []  # type: ignore
 
     def feed(self, msg: TTSChunk) -> None:
         """
-        Feed a WordBoundary message to the SubMaker object.
+        Feed a WordBoundary or SentenceBoundary message to the SubMaker object.
 
         Args:
-            msg (dict): The WordBoundary message.
+            msg (dict): The WordBoundary or SentenceBoundary message.
 
         Returns:
             None
         """
-        if msg["type"] != "WordBoundary":
+        if msg["type"] not in ("WordBoundary", "SentenceBoundary"):
             raise ValueError("Invalid message type, expected 'WordBoundary'")
 
         self.cues.append(

diff --git a/src/edge_tts/typing.py b/src/edge_tts/typing.py
@@ -10,11 +10,11 @@
 class TTSChunk(TypedDict):
     """TTS chunk data."""
 
-    type: Literal["audio", "WordBoundary"]
+    type: Literal["audio", "WordBoundary", "SentenceBoundary"]
     data: NotRequired[bytes]  # only for audio
-    duration: NotRequired[float]  # only for WordBoundary
-    offset: NotRequired[float]  # only for WordBoundary
-    text: NotRequired[str]  # only for WordBoundary
+    duration: NotRequired[float]  # only for WordBoundary and SentenceBoundary
+    offset: NotRequired[float]  # only for WordBoundary and SentenceBoundary
+    text: NotRequired[str]  # only for WordBoundary and SentenceBoundary
 
 
 class VoiceTag(TypedDict):

diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py
@@ -72,7 +72,7 @@ async def _run_tts(args: UtilArgs) -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 audio_file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)
 
         if args.words_in_cue > 0:
-Original file line number
+Diff line change
@@ Expand Up / @@ -162,3 +162,4 @@ cython_debug/ @@
     # Edge-TTS specific ignores
     *.mp3
     *.srt
+    /.idea/