kokoro support

KoljaB · Jan 11, 2025 · f8f7019 · f8f7019
1 parent 80d1f9c
commit f8f7019
Show file tree

Hide file tree

Showing 7 changed files with 278 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837
 - **High-Quality Audio**
   - generates clear and natural-sounding speech
 - **Multiple TTS Engine Support**
-  - supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS and System TTS
+  - supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro and System TTS
 - **Multilingual**
 - **Robust and Reliable**:
   - ensures continuous operation through a fallback mechanism
@@ -58,7 +58,11 @@ Let me know if you need any adjustments or additional languages!
 
 ## Updates
 
-Latest Version: v0.4.40
+Latest Version: v0.4.41
+
+- **New Engine:** KokoroEngine
+  - **Installation Tutorial:** [Usage on Hugging Face](https://huggingface.co/hexgrad/Kokoro-82M#usage)
+  - **Test File Example:** [kokoro_test.py](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/kokoro_test.py)
 
 - **New Engine:** PiperEngine
   - **Installation Tutorial:** [Watch on YouTube](https://www.youtube.com/watch?v=GGvdq3giiTQ)

diff --git a/RealtimeTTS/__init__.py b/RealtimeTTS/__init__.py
@@ -57,3 +57,8 @@
     from .engines import PiperEngine, PiperVoice  # noqa: F401
 except ImportError as e:
     PiperEngine, PiperVoice = None
+
+try:
+    from .engines import KokoroEngine  # noqa: F401
+except ImportError as e:
+    KokoroEngine = None
diff --git a/RealtimeTTS/engines/__init__.py b/RealtimeTTS/engines/__init__.py
@@ -56,3 +56,8 @@
     from .piper_engine import PiperEngine, PiperVoice  # noqa: F401
 except ImportError as e:
     PiperEngine, PiperVoice = None
+
+try:
+    from .kokoro_engine import KokoroEngine  # noqa: F401
+except ImportError as e:
+    KokoroEngine = None
diff --git a/RealtimeTTS/engines/kokoro_engine.py b/RealtimeTTS/engines/kokoro_engine.py
@@ -0,0 +1,271 @@
+"""
+Kokoro text-to-speech engine for RealtimeTTS.
+
+Needs:
+- a local checkout of Kokoro-82M (model checkpoint, voices directory and
+  the "models" / "kokoro" Python modules), passed in as kokoro_root
+- pip install munch
+"""
+import os
+import sys
+import time
+from queue import Queue
+from typing import Dict, Optional
+
+import numpy as np
+import pyaudio
+import torch
+
+from .base_engine import BaseEngine
+
+
+class KokoroEngine(BaseEngine):
+    """
+    A simple TTS engine that uses the Kokoro model for voice synthesis.
+
+    Loads the requested voices on init, allows switching the current voice,
+    and puts synthesized 16-bit PCM audio on self.queue.
+    """
+
+    def __init__(
+        self,
+        kokoro_root: str,
+        model_path: str = "kokoro-v0_19.pth",
+        voice_names: Optional[list] = None,
+        voices_dir: str = "voices",
+        debug: bool = False,
+    ):
+        """
+        Initializes the Kokoro text-to-speech engine.
+
+        Args:
+            kokoro_root (str): Directory of the Kokoro checkout. It is added to
+                sys.path so that the "models" and "kokoro" modules can be
+                imported, and it is the fallback location for the model
+                checkpoint and the voice files.
+            model_path (str): Path to the Kokoro model checkpoint. If it does
+                not exist as given, it is looked up inside kokoro_root.
+            voice_names (list): List of voice names you want to load. Defaults
+                to a set of known voices.
+            voices_dir (str): Directory where voice .pt files are stored. If a
+                voice file is not found there, it is looked up inside
+                kokoro_root.
+            debug (bool): If True, prints debug info.
+        """
+        super().__init__()  # Ensure BaseEngine is properly initialized
+        self.kokoro_root = kokoro_root.replace("\\", "/")
+
+        # The "models" and "kokoro" modules imported below live in the Kokoro
+        # checkout, so that directory has to be on sys.path.
+        root_directory = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), self.kokoro_root)
+        )
+        # Do not pile up duplicate entries when several engines are created.
+        if root_directory not in sys.path:
+            print(f"Adding {root_directory} to sys.path")
+            sys.path.append(root_directory)
+
+        self.debug = debug
+        self.queue = Queue()  # Queue for feeding audio data to the output
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Build the main model once
+        from models import build_model  # Kokoro-specific import
+
+        if not os.path.exists(model_path):
+            model_path = os.path.join(self.kokoro_root, model_path)
+        self.model = build_model(model_path, self.device)
+        if self.debug:
+            print(f"Kokoro model loaded from: {model_path} (device: {self.device})")
+
+        # If user didn't provide a voice list, fall back to defaults
+        if voice_names is None:
+            # This is just an example set; customize as needed
+            voice_names = [
+                "af_nicole",
+                "af",
+                "af_bella",
+                "af_sarah",
+                "am_adam",
+                "am_michael",
+                "bf_emma",
+                "bf_isabella",
+                "bm_george",
+                "bm_lewis",
+                "af_sky",
+            ]
+        self.voicepacks_dir = voices_dir
+        # Voice name -> voicepack as loaded by _load_voices().
+        self.voicepacks: Dict[str, torch.Tensor] = {}
+        self._load_voices(voice_names)
+
+        # Prefer the first requested voice. If it failed to load, fall back to
+        # the first voice that did load so the engine stays usable.
+        default_voice = voice_names[0] if voice_names else None
+        if default_voice not in self.voicepacks:
+            default_voice = next(iter(self.voicepacks), None)
+        self.current_voice_name = default_voice
+        self.current_voicepack = self.voicepacks.get(default_voice)
+
+        # Warm up the model if possible
+        self._warm_up_model()
+
+        self.post_init()
+
+    def post_init(self):
+        """
+        Called after initialization. Sets the engine name.
+        """
+        self.engine_name = "kokoro"
+
+    def _load_voices(self, voice_names: list):
+        """
+        Loads the specified voice .pt files into self.voicepacks.
+
+        Each voice is looked up as <voices_dir>/<name>.pt first and, if that
+        path does not exist, relative to kokoro_root. A voice that fails to
+        load is reported and skipped; the remaining voices are still loaded.
+        """
+        for voice_name in voice_names:
+            # Resolve the path before the try block so the error message
+            # below can always refer to it.
+            path = os.path.join(self.voicepacks_dir, f"{voice_name}.pt")
+            if not os.path.exists(path):
+                path = os.path.join(self.kokoro_root, path)
+            try:
+                # map_location lets voicepacks saved on a GPU load on a
+                # CPU-only machine.
+                voicepack = torch.load(
+                    path, map_location=self.device, weights_only=True
+                ).to(self.device)
+            except Exception as e:
+                print(f"Failed to load voice {voice_name} from {path}. Error: {e}")
+                continue
+            self.voicepacks[voice_name] = voicepack
+            if self.debug:
+                print(f"Loaded Kokoro voice: {voice_name}")
+
+    def _warm_up_model(self):
+        """
+        Runs a quick, minimal synthesis to get everything ready.
+
+        Skipped when no voice is set. A failing warm-up only prints a warning.
+        """
+        from kokoro import generate  # Kokoro-specific import
+
+        if self.current_voicepack is None:
+            print("No voice is currently set. Skipping model warm-up.")
+            return
+
+        warm_text = "Hello world."
+        if self.debug:
+            print(f"Warming up model with voice: {self.current_voice_name}")
+        try:
+            # We only care that it runs without error
+            generate(
+                self.model,
+                warm_text,
+                self.current_voicepack,
+                lang=self.current_voice_name[0],
+            )
+            if self.debug:
+                print("Kokoro model warm-up completed.")
+        except Exception as e:
+            print(f"Warning: Model warm-up failed. {e}")
+
+    def get_stream_info(self):
+        """
+        Returns PyAudio stream configuration for Kokoro audio.
+
+        Returns:
+            tuple: (format, channels, rate)
+        """
+        # Kokoro examples use a 24 kHz sample rate
+        return (pyaudio.paInt16, 1, 24000)
+
+    @staticmethod
+    def _to_pcm16(audio) -> bytes:
+        """
+        Converts float audio samples to 16-bit PCM bytes.
+
+        Samples are clipped to [-1.0, 1.0] before scaling, because values
+        outside that range would overflow the int16 cast and distort the
+        output.
+        """
+        if isinstance(audio, torch.Tensor):
+            audio = audio.detach().cpu().numpy()
+        samples = np.clip(np.asarray(audio), -1.0, 1.0)
+        return (samples * 32767).astype(np.int16).tobytes()
+
+    def synthesize(self, text: str) -> bool:
+        """
+        Synthesizes text into audio data using Kokoro.
+
+        The audio is put on self.queue as 16-bit PCM bytes.
+
+        Args:
+            text (str): The text to be converted to speech.
+
+        Returns:
+            bool: True if successful, False otherwise.
+        """
+        from kokoro import generate  # Kokoro-specific import
+
+        if self.current_voicepack is None:
+            print("No valid voice is currently set.")
+            return False
+
+        start_time = time.time()
+
+        try:
+            # The lang argument is just the first character of the voice name in the example
+            lang_code = self.current_voice_name[0] if self.current_voice_name else "a"
+
+            # Generate float audio with Kokoro (assumption)
+            audio, _ = generate(self.model, text, self.current_voicepack, lang=lang_code)
+
+            # Convert to int16 for playback and put it in our queue
+            self.queue.put(self._to_pcm16(audio))
+
+            if self.debug:
+                end_time = time.time()
+                print(f"Synthesis complete in {end_time - start_time:.3f}s.")
+
+            return True
+
+        except Exception as e:
+            print(f"Error generating audio: {e}")
+            return False
+
+    def set_voice(self, voice_name: str):
+        """
+        Sets the voice used for speech synthesis to one of the loaded voicepacks.
+
+        An unknown name is reported and leaves the current voice unchanged.
+
+        Args:
+            voice_name (str): The name of the voice pack (e.g., 'af_sarah').
+        """
+        if voice_name in self.voicepacks:
+            self.current_voice_name = voice_name
+            self.current_voicepack = self.voicepacks[voice_name]
+            if self.debug:
+                print(f"Voice set to {voice_name}")
+        else:
+            print(f"Voice '{voice_name}' not found in loaded voicepacks.")
+
+    def get_voices(self):
+        """
+        Returns a list of loaded voice names.
+
+        Returns:
+            list[str]: The loaded voice names.
+        """
+        return list(self.voicepacks.keys())
+
+    def shutdown(self):
+        """
+        Cleans up any resources used by KokoroEngine.
+        """
+        # If there's anything to release or finalize, do it here.
+        pass
diff --git a/setup.py b/setup.py
@@ -1,4 +1,4 @@
-current_version = "0.4.40"
+current_version = "0.4.41"
 
 import setuptools
 

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/kokoro_test.py b/tests/kokoro_test.py
@@ -0,0 +1,55 @@
+if __name__ == "__main__":
+    import os
+
+    from RealtimeTTS import TextToAudioStream, KokoroEngine
+
+    # RealtimeTTS sets KokoroEngine to None when the engine cannot be imported.
+    if KokoroEngine is None:
+        raise SystemExit("KokoroEngine is not available; check its dependencies.")
+
+    def dummy_generator():
+        yield "This is the first voice model speaking. "
+        yield "The elegance of the style and its flow is simply captivating. "
+        yield "We’ll soon switch to another voice model. "
+
+    def dummy_generator_2():
+        yield "And here we are! "
+        yield "You’re now listening to the second voice model, with a different style and tone. "
+        yield "It’s fascinating how Kokoro can adapt seamlessly. "
+
+    # Path to the local Kokoro-82M checkout. Set the KOKORO_ROOT environment
+    # variable to override the default without editing this file.
+    kokoro_root = os.environ.get("KOKORO_ROOT", "D:/Dev/Kokoro/Kokoro-82M")
+
+    # Initialize the engine with the default voices
+    engine = KokoroEngine(
+        kokoro_root=kokoro_root,
+    )
+
+    try:
+        # Create a TextToAudioStream with the engine
+        stream = TextToAudioStream(engine)
+
+        # Play with the first model
+        print("Playing with the first model...")
+        stream.feed(dummy_generator())
+        stream.play(log_synthesized_text=True)
+
+        # Switch to a second voice. Pick one of:
+        # "af_nicole",
+        # "af",
+        # "af_bella",
+        # "af_sarah",
+        # "am_adam",
+        # "am_michael",
+        # "bf_emma",
+        # "bf_isabella",
+        # "bm_george",
+        # "bm_lewis",
+        # "af_sky"
+        engine.set_voice("af_sky")
+        stream.feed(dummy_generator_2())
+        stream.play(log_synthesized_text=True)
+    finally:
+        # Shut the engine down even if playback fails
+        engine.shutdown()