diff --git a/examples/streaming_with_subtitles.py b/examples/streaming_with_subtitles.py
index e562d9e..1534010 100644
--- a/examples/streaming_with_subtitles.py
+++ b/examples/streaming_with_subtitles.py
@@ -29,7 +29,7 @@ async def amain() -> None:
                 submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
 
     with open(WEBVTT_FILE, "w", encoding="utf-8") as file:
-        file.write(submaker.generate_subs())
+        file.write(submaker.generate_subs(TEXT))
 
 
 if __name__ == "__main__":
diff --git a/pylintrc b/pylintrc
index 0801c2a..dffec02 100644
--- a/pylintrc
+++ b/pylintrc
@@ -307,8 +307,7 @@ min-public-methods=2
 [EXCEPTIONS]
 
 # Exceptions that will emit a warning when caught.
-overgeneral-exceptions=BaseException,
-                       Exception
+overgeneral-exceptions=builtins.BaseException,builtins.Exception
 
 
 [FORMAT]
diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
index 0cf9e80..6b7ad63 100644
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -239,7 +239,7 @@ def __init__(
         volume: str = "+0%",
         pitch: str = "+0Hz",
         proxy: Optional[str] = None,
-    ):
+    ):  # pylint: disable=too-many-arguments
         """
         Initializes the Communicate class.
 
@@ -302,7 +302,9 @@ def __init__(
             raise TypeError("proxy must be str")
         self.proxy: Optional[str] = proxy
 
-    async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
+    async def stream(  # pylint: disable=too-many-statements
+        self,
+    ) -> AsyncGenerator[Dict[str, Any], None]:
         """Streams audio and metadata from the service."""
 
         texts = split_text_by_byte_length(
diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py
index f05c37e..57f452e 100644
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -6,7 +6,8 @@
 """
 
 import math
-from typing import List, Tuple
+import re
+from typing import Callable, List, Tuple, Union
 from xml.sax.saxutils import escape, unescape
 
 
@@ -35,10 +36,60 @@ def mktimestamp(time_unit: float) -> str:
     hour = math.floor(time_unit / 10**7 / 3600)
     minute = math.floor((time_unit / 10**7 / 60) % 60)
     seconds = (time_unit / 10**7) % 60
-    # return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
     return f"{hour:02d}:{minute:02d}:{seconds:06.3f}".replace(".", ",")
 
 
+def _spinoff_sentence(sentence: str) -> Tuple[str, str, int]:
+    """
+    _spinoff_sentence returns the sentence, its final character (used as the
+    "last word"), and the number of times that character appears in the sentence.
+
+    Args:
+        sentence (str): The sentence to be processed.
+
+    Returns:
+        Tuple[str, str, int]: The sentence, its final character, and the
+        number of times that character appears in the sentence.
+    """
+    if not isinstance(sentence, str):
+        raise TypeError("sentence must be a string")
+    last_word = sentence[-1]
+    last_word_num = sentence.count(last_word)
+    return (sentence, last_word, last_word_num)
+
+
+def process_text(
+    text: str,
+    *,
+    pattern_chi: str = r"[:“”‘’──{}【】·《》〈〉,、;。?!]",
+    spinoff_sentence: Callable[[str], Tuple[str, str, int]] = _spinoff_sentence,
+) -> List[Tuple[str, str, int]]:
+    """
+    process_text splits the text into sentences and returns the list of
+    (sentence, last_word, last_word_num) tuples expected by SubMaker.generate_subs.
+
+    Args:
+        text (str): The text to be processed.
+        pattern_chi (str): Regex of punctuation (mainly Chinese) used to split sentences.
+        spinoff_sentence (function): The function used to process each sentence.
+
+    Returns:
+        List[Tuple[str, str, int]]: The processed list, one tuple per sentence.
+    """
+    if not isinstance(text, str):
+        raise TypeError("text must be a string")
+    if not isinstance(pattern_chi, str):
+        raise TypeError("pattern_chi must be a string")
+    if not callable(spinoff_sentence):
+        raise TypeError("spinoff_sentence must be a function")
+    sentences = re.split(pattern_chi, text)
+    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
+    three_dimensional_list = []
+    for sentence in sentences:
+        three_dimensional_list.append(spinoff_sentence(sentence))
+    return three_dimensional_list
+
+
 class SubMaker:
     """
     SubMaker class
@@ -54,7 +105,8 @@ def __init__(self) -> None:
     def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
         """
         create_sub creates a subtitle with the given timestamp and text
-        and adds it to the list of subtitles
+        and adds it to the list of subtitles; it should be called
+        whenever a WordBoundary event is received from the service.
 
         Args:
             timestamp (tuple): The offset and duration of the subtitle.
@@ -66,53 +118,58 @@ def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
         self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
         self.subs.append(text)
 
-    def generate_subs(self, three_dimensional_list, words_in_cue: int = 10) -> str:
+    def generate_subs(self, text: Union[str, List[Tuple[str, str, int]]]) -> str:
         """
         generate_subs generates the complete subtitle file.
 
         Args:
-            words_in_cue (int): defines the number of words in a given cue
+            text: Either the original text that was used to generate the audio (str),
+                which is processed automatically by process_text with its default
+                parameters, or a list of (sentence, last_word, last_word_num) tuples
+                as returned by process_text. Do not pass text reassembled from
+                WordBoundary events.
 
         Returns:
             str: The complete subtitle file.
-
-        three_dimensional_list:
-            [(sentence, last_word, last_word_num), (sentence, last_word, last_word_num)]
         """
         if len(self.subs) != len(self.offset):
             raise ValueError("subs and offset are not of the same length")
 
-        if words_in_cue <= 0:
-            raise ValueError("words_in_cue must be greater than 0")
+        if isinstance(text, str):
+            text = process_text(text)
+        elif isinstance(text, list):
+            for sentence, last_word, last_word_num in text:
+                if not isinstance(sentence, str):
+                    raise TypeError("sentence (first element) must be a string")
+                if not isinstance(last_word, str):
+                    raise TypeError("last_word (second element) must be a string")
+                if not isinstance(last_word_num, int):
+                    raise TypeError("last_word_num (third element) must be an integer")
+        else:
+            raise TypeError("text must be a string or a list")
 
-        # data = "WEBVTT\r\n\r\n"
         data = ""
         sub_state_count = 0
         sub_state_start = -1.0
         sub_state_subs = ""
-        sub_line_count = (
-            0  # new variable used to indicate which line of subtitle this is
-        )
+        sub_line_count = 0
         for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
             start_time, end_time = offset
             subs = unescape(subs)
 
             # wordboundary is guaranteed not to contain whitespace
-            # if len(sub_state_subs) > 0:
-            #     sub_state_subs += " "
             sub_state_subs += subs
 
             if sub_state_start == -1.0:
                 sub_state_start = start_time
             sub_state_count += 1
-            sentence, last_word, last_word_num = three_dimensional_list[sub_line_count]
+            sentence, last_word, last_word_num = text[sub_line_count]
 
             if (
                 sub_state_subs.count(last_word) == last_word_num
                 or idx == len(self.offset) - 1
             ):
                 sub_line_count += 1
-                # subs = sub_state_subs
                 subs = sentence
                 split_subs: List[str] = [
                     subs[i : i + 79] for i in range(0, len(subs), 79)
diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py
index 01fabbc..500aab3 100644
--- a/src/edge_tts/util.py
+++ b/src/edge_tts/util.py
@@ -5,7 +5,6 @@
 
 import argparse
 import asyncio
-import re
 import sys
 from io import TextIOWrapper
 from typing import Any, TextIO, Union
@@ -16,7 +15,7 @@
 async def _print_voices(*, proxy: str) -> None:
     """Print all available voices."""
     voices = await list_voices(proxy=proxy)
-    voices = sorted(voices, key=lambda voice: voice["ShortName"])  # type: ignore
+    voices = sorted(voices, key=lambda voice: voice["ShortName"])
     for idx, voice in enumerate(voices):
         if idx != 0:
             print()
@@ -35,12 +34,6 @@ async def _print_voices(*, proxy: str) -> None:
             print(f"{pretty_key_name}: {voice[key]}")
 
 
-def _spinoff_sentence(sentence):
-    last_word = sentence[-1]
-    last_word_num = sentence.count(last_word)
-    return (sentence, last_word, last_word_num)
-
-
 async def _run_tts(args: Any) -> None:
     """Run TTS after parsing arguments from command line."""
 
@@ -67,14 +60,6 @@ async def _run_tts(args: Any) -> None:
     )
     submaker: SubMaker = SubMaker()
-
-    pattern_chi = r"[:“”‘’──{}【】·《》〈〉,、;。?!]"
-    sentences = re.split(pattern_chi, args.text)
-    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
-    three_dimensional_list = []
-    for sentence in sentences:
-        three_dimensional_list.append(_spinoff_sentence(sentence))
-
     with open(
         args.write_media, "wb"
     ) if args.write_media else sys.stdout.buffer as audio_file:
@@ -90,9 +75,7 @@ async def _run_tts(args: Any) -> None:
             else sys.stderr
         )
         with sub_file:
-            sub_file.write(
-                submaker.generate_subs(three_dimensional_list=three_dimensional_list)
-            )
+            sub_file.write(submaker.generate_subs(args.text))
 
 
 async def amain() -> None:
@@ -116,12 +99,6 @@ async def amain() -> None:
     parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%")
     parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%")
     parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz")
-    parser.add_argument(
-        "--words-in-cue",
-        help="number of words in a subtitle cue. Default: 10.",
-        default=10,
-        type=float,
-    )
     parser.add_argument(
         "--write-media", help="send media output to file instead of stdout"
     )