From f8f7019da9198b14598066d6def7936176198170 Mon Sep 17 00:00:00 2001
From: KoljaB
Date: Sat, 11 Jan 2025 17:41:05 +0100
Subject: [PATCH] kokoro support

---
 README.md                            |   8 +-
 RealtimeTTS/__init__.py              |   5 +
 RealtimeTTS/engines/__init__.py      |   5 +
 RealtimeTTS/engines/kokoro_engine.py | 217 +++++++++++++++++++++++++++
 setup.py                             |   2 +-
 tests/__init__.py                    |   0
 tests/kokoro_test.py                 |  47 ++++++++
 7 files changed, 281 insertions(+), 3 deletions(-)
 create mode 100644 RealtimeTTS/engines/kokoro_engine.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/kokoro_test.py

diff --git a/README.md b/README.md
index f241da56..83fb5312 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837
 - **High-Quality Audio**
   - generates clear and natural-sounding speech
 - **Multiple TTS Engine Support**
-  - supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS and System TTS
+  - supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro and System TTS
 - **Multilingual**
 - **Robust and Reliable**:
   - ensures continuous operation through a fallback mechanism
@@ -58,7 +58,11 @@ Let me know if you need any adjustments or additional languages!
 
 ## Updates
 
-Latest Version: v0.4.40
+Latest Version: v0.4.41
+
+- **New Engine:** KokoroEngine
+  - **Installation Tutorial:** [Usage on Huggingface](https://huggingface.co/hexgrad/Kokoro-82M#usage)
+  - **Test File Example:** [kokoro_test.py](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/kokoro_test.py)
 
 - **New Engine:** PiperEngine
   - **Installation Tutorial:** [Watch on YouTube](https://www.youtube.com/watch?v=GGvdq3giiTQ)
diff --git a/RealtimeTTS/__init__.py b/RealtimeTTS/__init__.py
index c8adea0a..0c58eb89 100644
--- a/RealtimeTTS/__init__.py
+++ b/RealtimeTTS/__init__.py
@@ -57,3 +57,8 @@
     from .engines import PiperEngine, PiperVoice  # noqa: F401
 except ImportError as e:
     PiperEngine, PiperVoice = None
+
+try:
+    from .engines import KokoroEngine  # noqa: F401
+except ImportError as e:
+    KokoroEngine = None
diff --git a/RealtimeTTS/engines/__init__.py b/RealtimeTTS/engines/__init__.py
index c93b27a0..fd5308d1 100644
--- a/RealtimeTTS/engines/__init__.py
+++ b/RealtimeTTS/engines/__init__.py
@@ -56,3 +56,8 @@
     from .piper_engine import PiperEngine, PiperVoice  # noqa: F401
 except ImportError as e:
     PiperEngine, PiperVoice = None
+
+try:
+    from .kokoro_engine import KokoroEngine  # noqa: F401
+except ImportError as e:
+    KokoroEngine = None
diff --git a/RealtimeTTS/engines/kokoro_engine.py b/RealtimeTTS/engines/kokoro_engine.py
new file mode 100644
index 00000000..4456cb10
--- /dev/null
+++ b/RealtimeTTS/engines/kokoro_engine.py
@@ -0,0 +1,217 @@
+"""
+Needs:
+- pip install munch
+
+"""
+from .base_engine import BaseEngine
+from typing import Optional, Dict
+from queue import Queue
+import pyaudio
+import torch
+import time
+import sys
+import os
+
+# You may need these if you plan to write WAV files or do numeric conversions
+import numpy as np
+# from scipy.io.wavfile import write  # Uncomment if you want to save WAV files
+
+class KokoroEngine(BaseEngine):
+    """
+    A simple TTS engine that uses the Kokoro model for voice synthesis.
+    Loads all voices on init, allows setting a current voice, and generates audio.
+    """
+
+    def __init__(
+        self,
+        kokoro_root: str,
+        model_path: str = "kokoro-v0_19.pth",
+        voice_names: Optional[list] = None,
+        voices_dir: str = "voices",
+        debug: bool = False
+    ):
+        """
+        Initializes the Kokoro text-to-speech engine.
+
+        Args:
+            kokoro_root (str): Path to the local Kokoro checkout; it is added to sys.path so its modules can be imported.
+            model_path (str): Path to the Kokoro model checkpoint.
+            voice_names (list): List of voice names you want to load. Defaults to a set of known voices.
+            voices_dir (str): Directory where voice .pt files are stored.
+            debug (bool): If True, prints debug info.
+        """
+        super().__init__()  # Ensure BaseEngine is properly initialized
+        self.kokoro_root = kokoro_root.replace("\\", "/")
+
+        # Add the root directory to sys.path (only once, even if several engines are created)
+        root_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), self.kokoro_root))
+        if root_directory not in sys.path:
+            print(f"Adding {root_directory} to sys.path")
+            sys.path.append(root_directory)
+
+        self.debug = debug
+        self.queue = Queue()  # Queue for feeding audio data to the output
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Build the main model once
+        from models import build_model  # Kokoro-specific import
+        if not os.path.exists(model_path):
+            model_path = os.path.join(self.kokoro_root, model_path)
+        self.model = build_model(model_path, self.device)
+        if self.debug:
+            print(f"Kokoro model loaded from: {model_path} (device: {self.device})")
+
+        # If user didn't provide a voice list, fall back to defaults
+        if voice_names is None:
+            # This is just an example set; customize as needed
+            voice_names = [
+                "af_nicole",
+                "af",
+                "af_bella",
+                "af_sarah",
+                "am_adam",
+                "am_michael",
+                "bf_emma",
+                "bf_isabella",
+                "bm_george",
+                "bm_lewis",
+                "af_sky"
+            ]
+        self.voicepacks_dir = voices_dir
+        self.voicepacks: Dict[str, torch.Tensor] = {}
+        self._load_voices(voice_names)
+
+        # Pick the first voice as current by default (or None)
+        self.current_voice_name = voice_names[0] if voice_names else None
+        self.current_voicepack = self.voicepacks.get(self.current_voice_name, None)
+
+        # Warm up the model if possible
+        self._warm_up_model()
+
+        self.post_init()
+
+    def post_init(self):
+        """
+        Called after initialization. Sets the engine name.
+        """
+        self.engine_name = "kokoro"
+
+    def _load_voices(self, voice_names: list):
+        """
+        Loads all specified voice .pt files into memory and stores them in a dict.
+        """
+        for voice_name in voice_names:
+            path = os.path.join(self.voicepacks_dir, f"{voice_name}.pt")
+            if not os.path.exists(path):
+                path = os.path.join(self.kokoro_root, path)
+            try:
+                # map_location lets voicepacks that were saved on a GPU load on CPU-only machines
+                voicepack = torch.load(path, map_location=self.device, weights_only=True).to(self.device)
+                self.voicepacks[voice_name] = voicepack
+                if self.debug:
+                    print(f"Loaded Kokoro voice: {voice_name}")
+            except Exception as e:
+                print(f"Failed to load voice {voice_name} from {path}. Error: {e}")
+
+    def _warm_up_model(self):
+        """
+        Runs a quick, minimal synthesis to get everything ready.
+        """
+        from kokoro import generate  # Kokoro-specific import
+
+        if self.current_voicepack is None:
+            print("No voice is currently set. Skipping model warm-up.")
+            return
+
+        warm_text = "Hello world."
+        if self.debug:
+            print(f"Warming up model with voice: {self.current_voice_name}")
+        try:
+            # We only care that it runs without error
+            generate(self.model, warm_text, self.current_voicepack, lang=self.current_voice_name[0])
+            if self.debug:
+                print("Kokoro model warm-up completed.")
+        except Exception as e:
+            print(f"Warning: Model warm-up failed. {e}")
+
+    def get_stream_info(self):
+        """
+        Returns PyAudio stream configuration for Kokoro audio.
+
+        Returns:
+            tuple: (format, channels, rate)
+        """
+        # Kokoro examples use a 24 kHz sample rate
+        return (pyaudio.paInt16, 1, 24000)
+
+    def synthesize(self, text: str) -> bool:
+        """
+        Synthesizes text into audio data using Kokoro.
+
+        Args:
+            text (str): The text to be converted to speech.
+
+        Returns:
+            bool: True if successful, False otherwise.
+        """
+        from kokoro import generate  # Kokoro-specific import
+
+        if self.current_voicepack is None:
+            print("No valid voice is currently set.")
+            return False
+
+        start_time = time.time()
+
+        try:
+            # The lang argument is just the first character of the voice name in the example
+            lang_code = self.current_voice_name[0] if self.current_voice_name else "a"
+
+            # Generate float32 audio with Kokoro (assumption)
+            audio_float32, _ = generate(self.model, text, self.current_voicepack, lang=lang_code)
+
+            # Convert to int16 for playback; clip first so out-of-range samples saturate instead of overflowing
+            audio_int16 = (np.clip(audio_float32, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
+
+            # Put the audio in our queue
+            self.queue.put(audio_int16)
+
+            if self.debug:
+                end_time = time.time()
+                print(f"Synthesis complete in {end_time - start_time:.3f}s.")
+
+            return True
+
+        except Exception as e:
+            print(f"Error generating audio: {e}")
+            return False
+
+    def set_voice(self, voice_name: str):
+        """
+        Sets the voice used for speech synthesis to one of the loaded voicepacks.
+
+        Args:
+            voice_name (str): The name of the voice pack (e.g., 'af_sarah').
+        """
+        if voice_name in self.voicepacks:
+            self.current_voice_name = voice_name
+            self.current_voicepack = self.voicepacks[voice_name]
+            if self.debug:
+                print(f"Voice set to {voice_name}")
+        else:
+            print(f"Voice '{voice_name}' not found in loaded voicepacks.")
+
+    def get_voices(self):
+        """
+        Returns a list of loaded voice names.
+
+        Returns:
+            list[str]: The loaded voice names.
+        """
+        return list(self.voicepacks.keys())
+
+    def shutdown(self):
+        """
+        Cleans up any resources used by KokoroEngine.
+        """
+        # If there's anything to release or finalize, do it here.
+        pass
diff --git a/setup.py b/setup.py
index ce775dd1..65a1bd57 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-current_version = "0.4.40"
+current_version = "0.4.41"
 
 import setuptools
 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/kokoro_test.py b/tests/kokoro_test.py
new file mode 100644
index 00000000..103ef05b
--- /dev/null
+++ b/tests/kokoro_test.py
@@ -0,0 +1,47 @@
+if __name__ == "__main__":
+    from RealtimeTTS import TextToAudioStream, KokoroEngine
+
+    def dummy_generator():
+        yield "This is the first voice model speaking. "
+        yield "The elegance of the style and its flow is simply captivating. "
+        yield "We’ll soon switch to another voice model. "
+
+    def dummy_generator_2():
+        yield "And here we are! "
+        yield "You’re now listening to the second voice model, with a different style and tone. "
+        yield "It’s fascinating how Kokoro can adapt seamlessly. "
+
+    # Adjust these paths to your local setup
+    kokoro_root = "D:/Dev/Kokoro/Kokoro-82M"
+
+    # Initialize the engine with the first voice
+    engine = KokoroEngine(
+        kokoro_root=kokoro_root,
+    )
+
+    # Create a TextToAudioStream with the engine
+    stream = TextToAudioStream(engine)
+
+    # Play with the first model
+    print("Playing with the first model...")
+    stream.feed(dummy_generator())
+    stream.play(log_synthesized_text=True)
+
+    engine.set_voice("af_sky")
+    # Pick one of:
+    # "af_nicole",
+    # "af",
+    # "af_bella",
+    # "af_sarah",
+    # "am_adam",
+    # "am_michael",
+    # "bf_emma",
+    # "bf_isabella",
+    # "bm_george",
+    # "bm_lewis",
+    # "af_sky"
+    stream.feed(dummy_generator_2())
+    stream.play(log_synthesized_text=True)
+
+    # Shutdown the engine
+    engine.shutdown()