diff --git a/README.md b/README.md
index b602aa9e..acc11b5d 100644
--- a/README.md
+++ b/README.md
@@ -397,6 +397,14 @@ These methods are responsible for executing the text-to-audio synthesis and play
 - **Default**: `True`
 - **Description**: When set to `True`, the method will prioritize speed, generating and playing sentence fragments faster. This is useful for applications where latency matters.
 
+###### `fast_sentence_fragment_allsentences` (bool)
+- **Default**: `False`
+- **Description**: When set to `True`, applies the fast sentence fragment processing to all sentences, not just the first one.
+
+###### `fast_sentence_fragment_allsentences_multiple` (bool)
+- **Default**: `False`
+- **Description**: When set to `True`, allows yielding multiple sentence fragments instead of just a single one.
+
 ###### `buffer_threshold_seconds` (float)
 - **Default**: `0.0`
 - **Description**: Specifies the time in seconds for the buffering threshold, which impacts the smoothness and continuity of audio playback.
@@ -453,6 +461,10 @@ These methods are responsible for executing the text-to-audio synthesis and play
 - **Default**: `12`
 - **Description**: The number of characters used to establish context for sentence boundary detection. A larger context improves the accuracy of detecting sentence boundaries.
 
+###### `context_size_look_overhead` (int)
+- **Default**: `12`
+- **Description**: Additional context size for looking ahead when detecting sentence boundaries.
+
 ###### `muted` (bool)
 - **Default**: `False`
 - **Description**: If True, disables audio playback via local speakers. Useful when you want to synthesize to a file or process audio chunks without playing them.
@@ -465,8 +477,6 @@ These methods are responsible for executing the text-to-audio synthesis and play
 - **Default**: `15`
 - **Description**: The number of words after which the first sentence fragment is forced to be yielded.
 
-By understanding and setting these parameters and methods appropriately, you can tailor the `TextToAudioStream` to meet the specific needs of your application.
-
 ### CUDA installation
 
 These steps are recommended for those who require **better performance** and have a compatible NVIDIA GPU.
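For orientation, here is a minimal usage sketch of the options documented in the README hunks above. The engine choice and the fed text are illustrative placeholders; only the keyword names come from the section above.

```python
# Illustrative sketch only: engine and text are placeholders; the keyword
# arguments mirror the parameters documented in the README section above.
from RealtimeTTS import TextToAudioStream, SystemEngine

stream = TextToAudioStream(SystemEngine())
stream.feed("First sentence arrives quickly. Later sentences are fragmented too.")

stream.play_async(
    fast_sentence_fragment=True,                        # yield fragments quickly
    fast_sentence_fragment_allsentences=True,           # apply fragmenting to every sentence
    fast_sentence_fragment_allsentences_multiple=True,  # allow several fragments per sentence
    context_size=12,
    context_size_look_overhead=12,                       # extra look-ahead for boundary detection
)
```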
diff --git a/RealtimeTTS/text_to_stream.py b/RealtimeTTS/text_to_stream.py
index d82a7e75..b8ca9138 100644
--- a/RealtimeTTS/text_to_stream.py
+++ b/RealtimeTTS/text_to_stream.py
@@ -160,6 +160,8 @@ def feed(self,
 
     def play_async(self,
                    fast_sentence_fragment: bool = True,
+                   fast_sentence_fragment_allsentences: bool = True,
+                   fast_sentence_fragment_allsentences_multiple: bool = False,
                    buffer_threshold_seconds: float = 0.0,
                    minimum_sentence_length: int = 10,
                    minimum_first_fragment_length: int = 10,
@@ -173,6 +175,7 @@ def play_async(self,
                    tokenize_sentences=None,
                    language: str = "",
                    context_size: int = 12,
+                   context_size_look_overhead: int = 12,
                    muted: bool = False,
                    sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-",
                    force_first_fragment_after_words=15,
@@ -183,10 +186,10 @@ def play_async(self,
         if not self.is_playing_flag:
             self.is_playing_flag = True
             # Pass additional parameter to differentiate external call
-            args = (fast_sentence_fragment, buffer_threshold_seconds, minimum_sentence_length,
+            args = (fast_sentence_fragment, fast_sentence_fragment_allsentences, fast_sentence_fragment_allsentences_multiple, buffer_threshold_seconds, minimum_sentence_length,
                     minimum_first_fragment_length, log_synthesized_text, reset_generated_text, output_wavfile,
                     on_sentence_synthesized, before_sentence_synthesized, on_audio_chunk, tokenizer, tokenize_sentences,
-                    language, context_size, muted, sentence_fragment_delimiters,
+                    language, context_size, context_size_look_overhead, muted, sentence_fragment_delimiters,
                     force_first_fragment_after_words, True)
             self.play_thread = threading.Thread(target=self.play, args=args)
             self.play_thread.start()
@@ -200,6 +203,8 @@ def play_async(self,
     def play(
         self,
         fast_sentence_fragment: bool = True,
+        fast_sentence_fragment_allsentences: bool = False,
+        fast_sentence_fragment_allsentences_multiple: bool = False,
         buffer_threshold_seconds: float = 0.0,
         minimum_sentence_length: int = 10,
         minimum_first_fragment_length: int = 10,
@@ -213,6 +218,7 @@ def play(
         tokenize_sentences=None,
         language: str = "en",
         context_size: int = 12,
+        context_size_look_overhead: int = 12,
         muted: bool = False,
         sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-",
         force_first_fragment_after_words=15,
@@ -225,6 +231,8 @@ def play(
 
         Args:
         - fast_sentence_fragment: Determines if sentence fragments should be quickly yielded. Useful when a faster response is desired even if a sentence isn't complete.
+        - fast_sentence_fragment_allsentences: By default, fast_sentence_fragment applies only to the first sentence. Set this to True to apply it to every sentence.
+        - fast_sentence_fragment_allsentences_multiple: When set to True, multiple sentence fragments can be yielded instead of only a single one.
         - buffer_threshold_seconds (float): Time in seconds for the buffering threshold, influencing the flow and continuity of audio playback. Set to 0 to deactivate. Default is 0.
           - How it Works: The system verifies whether there is more audio content in the buffer than the duration defined by buffer_threshold_seconds. If so, it proceeds to synthesize the next sentence, capitalizing on the remaining audio to maintain smooth delivery. A higher value means more audio is pre-buffered, which minimizes pauses during playback. Adjust this upwards if you encounter interruptions.
           - Helps to decide when to generate more audio based on buffered content.
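The docstring above describes the buffering decision only in prose. The following is a minimal sketch of that rule; the function name and structure are hypothetical and not RealtimeTTS internals.

```python
# Illustrative sketch of the buffer_threshold_seconds rule described in the
# docstring above; names are hypothetical, not RealtimeTTS internals.
def should_synthesize_next(buffered_audio_seconds: float,
                           buffer_threshold_seconds: float) -> bool:
    # A threshold of 0 deactivates the check, so synthesis always proceeds.
    if buffer_threshold_seconds <= 0:
        return True
    # Otherwise, synthesize the next sentence only while more audio is already
    # buffered than the threshold, keeping playback smooth without gaps.
    return buffered_audio_seconds > buffer_threshold_seconds
```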
@@ -325,7 +333,24 @@ def play(
         self.player.on_audio_chunk = self._on_audio_chunk
 
         # Generate sentences from the characters
-        generate_sentences = s2s.generate_sentences(self.thread_safe_char_iter, context_size=context_size, minimum_sentence_length=minimum_sentence_length, minimum_first_fragment_length=minimum_first_fragment_length, quick_yield_single_sentence_fragment=fast_sentence_fragment, cleanup_text_links=True, cleanup_text_emojis=True, tokenize_sentences=tokenize_sentences, tokenizer=tokenizer, language=language, log_characters=self.log_characters, sentence_fragment_delimiters=sentence_fragment_delimiters, force_first_fragment_after_words=force_first_fragment_after_words)
+        generate_sentences = s2s.generate_sentences(
+            self.thread_safe_char_iter,
+            context_size=context_size,
+            context_size_look_overhead=context_size_look_overhead,
+            minimum_sentence_length=minimum_sentence_length,
+            minimum_first_fragment_length=minimum_first_fragment_length,
+            quick_yield_single_sentence_fragment=fast_sentence_fragment,
+            quick_yield_for_all_sentences=fast_sentence_fragment_allsentences,
+            quick_yield_every_fragment=fast_sentence_fragment_allsentences_multiple,
+            cleanup_text_links=True,
+            cleanup_text_emojis=True,
+            tokenize_sentences=tokenize_sentences,
+            tokenizer=tokenizer,
+            language=language,
+            log_characters=self.log_characters,
+            sentence_fragment_delimiters=sentence_fragment_delimiters,
+            force_first_fragment_after_words=force_first_fragment_after_words
+        )
 
         # Create the synthesis chunk generator with the given sentences
         chunk_generator = self._synthesis_chunk_generator(generate_sentences, buffer_threshold_seconds, log_synthesized_text)
diff --git a/requirements.txt b/requirements.txt
index 32c25f0d..72d74315 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,13 +5,13 @@ pyttsx3==2.90
 azure-cognitiveservices-speech==1.38.0
 
 # elevenlabs is for ElevenlabsEngine
-elevenlabs==1.3.1
+elevenlabs==1.5.0
 
 # openai is for OpenAIEngine
-openai==1.35.10
+openai==1.36.1
 
 # gtts is for GTTSEngine
-gtts==2.5.1
+gtts==2.5.2
 
 # coqui_tts is for CoquiEngine
 coqui_tts==0.24.1
@@ -19,7 +19,7 @@ coqui_tts==0.24.1
 
 # stream2sentence is to quickly convert streamed text into sentences for real-time synthesis
-stream2sentence==0.2.3
+stream2sentence==0.2.5
 
 # pydub is used to convert chunks from mp3 to pcm (for openai tts)
 pydub==0.25.1
 
diff --git a/setup.py b/setup.py
index 2bc9db36..e0291aca 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ def parse_requirements(filename):
 
 setuptools.setup(
     name="RealTimeTTS",
-    version="0.4.21",
+    version="0.4.5",
     author="Kolja Beigel",
     author_email="kolja.beigel@web.de",
     description="Stream text into audio with an easy-to-use, highly configurable library delivering voice output with minimal latency.",
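Since the restructured call above forwards the new flags to stream2sentence (bumped to 0.2.5 in requirements.txt), a standalone sketch of driving that generator directly can help when tuning fragment timing. The character source is illustrative; the keyword names are taken from the call site above, and omitted keywords are assumed to fall back to their defaults.

```python
# Standalone sketch: drive stream2sentence directly with the keyword arguments
# used in text_to_stream.py above. The input generator is illustrative.
import stream2sentence as s2s

def char_source():
    for ch in "Hello there. This is a longer sentence, streamed character by character.":
        yield ch

for fragment in s2s.generate_sentences(
    char_source(),
    context_size=12,
    context_size_look_overhead=12,
    minimum_sentence_length=10,
    minimum_first_fragment_length=10,
    quick_yield_single_sentence_fragment=True,
    quick_yield_for_all_sentences=True,
    quick_yield_every_fragment=True,
):
    print(repr(fragment))
```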