Skip to content

Commit

Permalink
[Enhancement] turn a sentence into a subtitle, no matter how many words in the sentence.
Browse files Browse the repository at this point in the history

Based on #137
  • Loading branch information
suxinde2009 committed Dec 14, 2023
1 parent 5b7add9 commit 4d0fdf8
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 19 deletions.
36 changes: 23 additions & 13 deletions src/edge_tts/submaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
from xml.sax.saxutils import escape, unescape


def formatter(start_time: float, end_time: float, subdata: str) -> str:
def formatter(sub_line_count: int, start_time: float, end_time: float, subdata: str) -> str:
"""
formatter returns the timecode and the text of the subtitle.
"""
return (
f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\r\n"
f"{escape(subdata)}\r\n\r\n"
f"{sub_line_count}\n"
f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\n"
f"{escape(subdata)}\n\n"
)


Expand All @@ -32,8 +33,8 @@ def mktimestamp(time_unit: float) -> str:
hour = math.floor(time_unit / 10**7 / 3600)
minute = math.floor((time_unit / 10**7 / 60) % 60)
seconds = (time_unit / 10**7) % 60
return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"

# return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
return f"{hour:02d}:{minute:02d}:{seconds:06.3f}".replace(".", ",")

class SubMaker:
"""
Expand Down Expand Up @@ -62,7 +63,7 @@ def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
self.subs.append(text)

def generate_subs(self, words_in_cue: int = 10) -> str:
def generate_subs(self, three_dimensional_list, words_in_cue: int = 10) -> str:
"""
generate_subs generates the complete subtitle file.
Expand All @@ -71,32 +72,40 @@ def generate_subs(self, words_in_cue: int = 10) -> str:
Returns:
str: The complete subtitle file.
three_dimensional_list:
[(sentence, last_word, last_word_num), (sentence, last_word, last_word_num)]
"""
if len(self.subs) != len(self.offset):
raise ValueError("subs and offset are not of the same length")

if words_in_cue <= 0:
raise ValueError("words_in_cue must be greater than 0")

data = "WEBVTT\r\n\r\n"
# data = "WEBVTT\r\n\r\n"
data = ''
sub_state_count = 0
sub_state_start = -1.0
sub_state_subs = ""
sub_line_count = 0 # new variable used to indicate which line of subtitle this is
for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
start_time, end_time = offset
subs = unescape(subs)

# wordboundary is guaranteed not to contain whitespace
if len(sub_state_subs) > 0:
sub_state_subs += " "
# if len(sub_state_subs) > 0:
# sub_state_subs += " "
sub_state_subs += subs

if sub_state_start == -1.0:
sub_state_start = start_time
sub_state_count += 1

if sub_state_count == words_in_cue or idx == len(self.offset) - 1:
subs = sub_state_subs
sentence, last_word, last_word_num = three_dimensional_list[sub_line_count]
if sub_state_subs.count(last_word) == last_word_num or idx == len(self.offset) - 1:
sub_line_count += 1
# subs = sub_state_subs
subs = sentence
split_subs: List[str] = [
subs[i : i + 79] for i in range(0, len(subs), 79)
]
Expand All @@ -115,11 +124,12 @@ def generate_subs(self, words_in_cue: int = 10) -> str:
split_subs[i] += "-"

data += formatter(
sub_line_count=sub_line_count,
start_time=sub_state_start,
end_time=end_time,
subdata="\r\n".join(split_subs),
subdata="".join(split_subs),
)
sub_state_count = 0
sub_state_start = -1
sub_state_subs = ""
return data
return data
38 changes: 32 additions & 6 deletions src/edge_tts/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@

from edge_tts import Communicate, SubMaker, list_voices

import asyncio
import re

async def _print_voices(*, proxy: str) -> None:
"""Print all available voices."""
voices = await list_voices(proxy=proxy)
voices = sorted(voices, key=lambda voice: voice["ShortName"])
voices = sorted(voices, key=lambda voice: voice["ShortName"]) # type: ignore
for idx, voice in enumerate(voices):
if idx != 0:
print()
Expand All @@ -34,6 +36,23 @@ async def _print_voices(*, proxy: str) -> None:
print(f"{pretty_key_name}: {voice[key]}")


def spinoff_sentence(sentence: str):
    """Describe a sentence by its final character and that character's count.

    Args:
        sentence: Non-empty sentence text.

    Returns:
        A ``(sentence, last_word, last_word_num)`` tuple. Despite its name,
        ``last_word`` is the final *character* of ``sentence`` (not a
        whitespace-delimited word), and ``last_word_num`` is the number of
        times that character occurs in ``sentence``.

    Raises:
        ValueError: If ``sentence`` is empty, since there is no final
            character to report.
    """
    if not sentence:
        # Fail with a clear message instead of an opaque IndexError from [-1].
        raise ValueError("sentence must not be empty")
    last_word = sentence[-1]
    return (sentence, last_word, sentence.count(last_word))

async def tts_subtitle(text, three_dimensional_list, voice, audio_path, subtitle_path):
    """Synthesize ``text`` to an audio file and write its subtitles.

    Streams chunks from a ``Communicate`` instance: "audio" chunks are
    appended to ``audio_path`` and "WordBoundary" chunks are recorded in a
    ``SubMaker``. Once the stream ends, the generated subtitles are written
    to ``subtitle_path`` as UTF-8 text.

    Args:
        text: Text to synthesize.
        three_dimensional_list: Sequence of
            ``(sentence, last_word, last_word_num)`` tuples, passed through
            to ``SubMaker.generate_subs``.
        voice: Voice identifier passed to ``Communicate``.
        audio_path: Path of the audio file to write (opened in binary mode).
        subtitle_path: Path of the subtitle file to write.
    """
    # Use the names this module imports (`from edge_tts import Communicate,
    # SubMaker, ...`); the bare `edge_tts` module name is not bound by that
    # import, so `edge_tts.Communicate` would raise NameError.
    communicate = Communicate(text, voice)
    submaker = SubMaker()
    with open(audio_path, "wb") as file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                submaker.create_sub(
                    (chunk["offset"], chunk["duration"]), chunk["text"]
                )
    with open(subtitle_path, "w", encoding="utf-8") as file:
        file.write(
            submaker.generate_subs(three_dimensional_list=three_dimensional_list)
        )

async def _run_tts(args: Any) -> None:
"""Run TTS after parsing arguments from command line."""

Expand All @@ -57,25 +76,33 @@ async def _run_tts(args: Any) -> None:
proxy=args.proxy,
rate=args.rate,
volume=args.volume,
pitch=args.pitch,
)
subs: SubMaker = SubMaker()

submaker: SubMaker = SubMaker()

pattern_chi = r"[:“”‘’──{}【】·《》〈〉,、;。?!]"
sentences = re.split(pattern_chi, args.text)
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
three_dimensional_list = []
for sentence in sentences:
three_dimensional_list.append(spinoff_sentence(sentence))

with open(
args.write_media, "wb"
) if args.write_media else sys.stdout.buffer as audio_file:
async for chunk in tts.stream():
if chunk["type"] == "audio":
audio_file.write(chunk["data"])
elif chunk["type"] == "WordBoundary":
subs.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])

sub_file: Union[TextIOWrapper, TextIO] = (
open(args.write_subtitles, "w", encoding="utf-8")
if args.write_subtitles
else sys.stderr
)
with sub_file:
sub_file.write(subs.generate_subs(args.words_in_cue))
sub_file.write(submaker.generate_subs(three_dimensional_list=three_dimensional_list))


async def amain() -> None:
Expand All @@ -98,7 +125,6 @@ async def amain() -> None:
)
parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%")
parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%")
parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz")
parser.add_argument(
"--words-in-cue",
help="number of words in a subtitle cue. Default: 10.",
Expand Down

0 comments on commit 4d0fdf8

Please sign in to comment.