[Enhancement] turn a sentence into a subtitle, no matter how many words in the sentence. Based on #137
rany2 · Dec 14, 2023 · 4d0fdf8 · 4d0fdf8
1 parent 5b7add9
commit 4d0fdf8
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 19 deletions.
diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py
@@ -10,13 +10,14 @@
 from xml.sax.saxutils import escape, unescape
 
 
-def formatter(start_time: float, end_time: float, subdata: str) -> str:
+def formatter(sub_line_count: int, start_time: float, end_time: float, subdata: str) -> str:  # new leading parameter: the cue number to print
     """
     formatter returns the timecode and the text of the subtitle.
     """
     return (
-        f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\r\n"
-        f"{escape(subdata)}\r\n\r\n"
+        f"{sub_line_count}\n"  # cue number on its own line, ahead of the timecode
+        f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\n"  # start --> end, both rendered by mktimestamp
+        f"{escape(subdata)}\n\n"  # XML-escaped text followed by one blank separator line
     )
 
 
@@ -32,8 +33,8 @@ def mktimestamp(time_unit: float) -> str:
     hour = math.floor(time_unit / 10**7 / 3600)
     minute = math.floor((time_unit / 10**7 / 60) % 60)
     seconds = (time_unit / 10**7) % 60
-    return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
-
+    # return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
+    return f"{hour:02d}:{minute:02d}:{seconds:06.3f}".replace(".", ",")
 
 class SubMaker:
     """
@@ -62,7 +63,7 @@ def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
         self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
         self.subs.append(text)
 
-    def generate_subs(self, words_in_cue: int = 10) -> str:
+    def generate_subs(self, three_dimensional_list, words_in_cue: int = 10) -> str:
         """
         generate_subs generates the complete subtitle file.
 
@@ -71,32 +72,40 @@ def generate_subs(self, words_in_cue: int = 10) -> str:
 
         Returns:
             str: The complete subtitle file.
+
+        three_dimensional_list:
+            [(sentence, last_word, last_word_num), (sentence, last_word, last_word_num)]
         """
         if len(self.subs) != len(self.offset):
             raise ValueError("subs and offset are not of the same length")
 
         if words_in_cue <= 0:
             raise ValueError("words_in_cue must be greater than 0")
 
-        data = "WEBVTT\r\n\r\n"
+        # data = "WEBVTT\r\n\r\n"
+        data = ''
         sub_state_count = 0
         sub_state_start = -1.0
         sub_state_subs = ""
+        sub_line_count = 0     # new variable used to indicate which line of subtitle this is
         for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
             start_time, end_time = offset
             subs = unescape(subs)
 
             # wordboundary is guaranteed not to contain whitespace
-            if len(sub_state_subs) > 0:
-                sub_state_subs += " "
+            # if len(sub_state_subs) > 0:
+            #     sub_state_subs += " "
             sub_state_subs += subs
 
             if sub_state_start == -1.0:
                 sub_state_start = start_time
             sub_state_count += 1
 
-            if sub_state_count == words_in_cue or idx == len(self.offset) - 1:
-                subs = sub_state_subs
+            sentence, last_word, last_word_num = three_dimensional_list[sub_line_count]
+            if sub_state_subs.count(last_word) == last_word_num or idx == len(self.offset) - 1:
+                sub_line_count += 1
+                # subs = sub_state_subs
+                subs = sentence
                 split_subs: List[str] = [
                     subs[i : i + 79] for i in range(0, len(subs), 79)
                 ]
@@ -115,11 +124,12 @@ def generate_subs(self, words_in_cue: int = 10) -> str:
                         split_subs[i] += "-"
 
                 data += formatter(
+                    sub_line_count=sub_line_count,
                     start_time=sub_state_start,
                     end_time=end_time,
-                    subdata="\r\n".join(split_subs),
+                    subdata="".join(split_subs),
                 )
                 sub_state_count = 0
                 sub_state_start = -1
                 sub_state_subs = ""
-        return data
+        return data
diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py
@@ -11,11 +11,13 @@
 
 from edge_tts import Communicate, SubMaker, list_voices
 
+import asyncio
+import re
 
 async def _print_voices(*, proxy: str) -> None:
     """Print all available voices."""
     voices = await list_voices(proxy=proxy)
-    voices = sorted(voices, key=lambda voice: voice["ShortName"])
+    voices = sorted(voices, key=lambda voice: voice["ShortName"])  # type: ignore
     for idx, voice in enumerate(voices):
         if idx != 0:
             print()
@@ -34,6 +36,23 @@ async def _print_voices(*, proxy: str) -> None:
             print(f"{pretty_key_name}: {voice[key]}")
 
 
+def spinoff_sentence(sentence):  # build the (sentence, last_word, last_word_num) tuple for one sentence
+    last_word = sentence[-1]  # for a str this is the final character, not a word; an empty string raises IndexError
+    last_word_num = sentence.count(last_word)  # number of times that final element occurs in the sentence
+    return (sentence, last_word, last_word_num)
+
+async def tts_subtitle(text, three_dimensional_list, voice, audio_path, subtitle_path):  # stream TTS for text; write audio to audio_path and subtitles to subtitle_path
+    communicate = edge_tts.Communicate(text, voice)  # NOTE(review): the visible import is `from edge_tts import Communicate, SubMaker, list_voices`, which does not bind `edge_tts` -- confirm the module is imported elsewhere, otherwise this raises NameError
+    submaker = edge_tts.SubMaker()
+    with open(audio_path, "wb") as file:
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio":
+                file.write(chunk["data"])  # audio chunks are written to the file as they arrive
+            elif chunk["type"] == "WordBoundary":
+                submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])  # record (offset, duration) and text for later cue generation
+    with open(subtitle_path, "w", encoding="utf-8") as file:
+        file.write(submaker.generate_subs(three_dimensional_list=three_dimensional_list))  # words_in_cue is left at its default
+
 async def _run_tts(args: Any) -> None:
     """Run TTS after parsing arguments from command line."""
 
@@ -57,25 +76,33 @@ async def _run_tts(args: Any) -> None:
         proxy=args.proxy,
         rate=args.rate,
         volume=args.volume,
-        pitch=args.pitch,
     )
-    subs: SubMaker = SubMaker()
+
+    submaker: SubMaker = SubMaker()
+
+    pattern_chi = r"[：“”‘’──{}【】·《》〈〉，、；。？！]"
+    sentences = re.split(pattern_chi, args.text)
+    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
+    three_dimensional_list = []
+    for sentence in sentences:
+        three_dimensional_list.append(spinoff_sentence(sentence))
+
     with open(
         args.write_media, "wb"
     ) if args.write_media else sys.stdout.buffer as audio_file:
         async for chunk in tts.stream():
             if chunk["type"] == "audio":
                 audio_file.write(chunk["data"])
             elif chunk["type"] == "WordBoundary":
-                subs.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
+                submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
 
     sub_file: Union[TextIOWrapper, TextIO] = (
         open(args.write_subtitles, "w", encoding="utf-8")
         if args.write_subtitles
         else sys.stderr
     )
     with sub_file:
-        sub_file.write(subs.generate_subs(args.words_in_cue))
+        sub_file.write(submaker.generate_subs(three_dimensional_list=three_dimensional_list))
 
 
 async def amain() -> None:
@@ -98,7 +125,6 @@ async def amain() -> None:
     )
     parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%")
     parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%")
-    parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz")
     parser.add_argument(
         "--words-in-cue",
         help="number of words in a subtitle cue. Default: 10.",