Skip to content

Commit

Permalink
kokoro support
Browse files Browse the repository at this point in the history
  • Loading branch information
KoljaB committed Jan 11, 2025
1 parent 80d1f9c commit f8f7019
Show file tree
Hide file tree
Showing 7 changed files with 278 additions and 3 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837
- **High-Quality Audio**
- generates clear and natural-sounding speech
- **Multiple TTS Engine Support**
- supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS and System TTS
- supports OpenAI TTS, ElevenLabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro, and System TTS
- **Multilingual**
- **Robust and Reliable**:
- ensures continuous operation through a fallback mechanism
Expand Down Expand Up @@ -58,7 +58,11 @@ Let me know if you need any adjustments or additional languages!

## Updates

Latest Version: v0.4.40
Latest Version: v0.4.41

- **New Engine:** KokoroEngine
- **Installation Tutorial:** [Usage on Huggingface](https://huggingface.co/hexgrad/Kokoro-82M#usage)
- **Test File Example:** [kokoro_test.py](https://github.com/KoljaB/RealtimeTTS/blob/master/tests/kokoro_test.py)

- **New Engine:** PiperEngine
- **Installation Tutorial:** [Watch on YouTube](https://www.youtube.com/watch?v=GGvdq3giiTQ)
Expand Down
5 changes: 5 additions & 0 deletions RealtimeTTS/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,8 @@
from .engines import PiperEngine, PiperVoice # noqa: F401
except ImportError as e:
PiperEngine, PiperVoice = None

try:
    from .engines import KokoroEngine  # noqa: F401
except ImportError:
    # The import failed; keep the name defined (as None) so that
    # `from RealtimeTTS import KokoroEngine` still succeeds.
    KokoroEngine = None
5 changes: 5 additions & 0 deletions RealtimeTTS/engines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,8 @@
from .piper_engine import PiperEngine, PiperVoice # noqa: F401
except ImportError as e:
PiperEngine, PiperVoice = None

try:
    from .kokoro_engine import KokoroEngine  # noqa: F401
except ImportError:
    # The import failed; keep the name defined (as None) so that
    # `from .engines import KokoroEngine` still succeeds.
    KokoroEngine = None
214 changes: 214 additions & 0 deletions RealtimeTTS/engines/kokoro_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
"""
Needs:
- pip install munch
"""
from .base_engine import BaseEngine
from typing import Optional, Dict
from queue import Queue
import pyaudio
import torch
import time
import sys
import os

# You may need these if you plan to write WAV files or do numeric conversions
import numpy as np
# from scipy.io.wavfile import write # Uncomment if you want to save WAV files

class KokoroEngine(BaseEngine):
    """
    A simple TTS engine that uses the Kokoro model for voice synthesis.

    Loads all requested voices on init, allows setting a current voice, and
    pushes synthesized audio into ``self.queue`` as 16-bit PCM bytes.
    """

    # Voices loaded when the caller does not pass ``voice_names``.
    # The first entry becomes the initial voice.
    DEFAULT_VOICE_NAMES = (
        "af_nicole",
        "af",
        "af_bella",
        "af_sarah",
        "am_adam",
        "am_michael",
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
        "af_sky",
    )

    def __init__(
        self,
        kokoro_root: str,
        model_path: str = "kokoro-v0_19.pth",
        voice_names: Optional[list] = None,
        voices_dir: str = "voices",
        debug: bool = False
    ):
        """
        Initializes the Kokoro text-to-speech engine.

        Args:
            kokoro_root (str): Directory containing the Kokoro sources
                (the ``models`` and ``kokoro`` modules are imported from it).
                It is appended to ``sys.path``; a relative path is resolved
                against this file's directory for that purpose. It is also
                used as a fallback prefix for ``model_path`` and ``voices_dir``.
            model_path (str): Path to the Kokoro model checkpoint. If it does
                not exist as given, it is looked up inside ``kokoro_root``.
            voice_names (list): List of voice names you want to load.
                Defaults to ``DEFAULT_VOICE_NAMES``.
            voices_dir (str): Directory where voice .pt files are stored.
            debug (bool): If True, prints debug info.
        """
        super().__init__()  # Ensure BaseEngine is properly initialized
        self.kokoro_root = kokoro_root.replace("\\", "/")

        # Make the Kokoro sources importable. Guard against duplicates so that
        # creating several engines does not keep growing sys.path.
        root_directory = os.path.abspath(
            os.path.join(os.path.dirname(__file__), self.kokoro_root)
        )
        if root_directory not in sys.path:
            print(f"Adding {root_directory} to sys.path")
            sys.path.append(root_directory)

        self.debug = debug
        self.queue = Queue()  # Queue for feeding audio data to the output
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Build the main model once
        from models import build_model  # Kokoro-specific import
        # NOTE(review): this fallback joins with kokoro_root as given, so a
        # relative kokoro_root is resolved against the current working
        # directory here but against this file's directory for sys.path above.
        if not os.path.exists(model_path):
            model_path = os.path.join(self.kokoro_root, model_path)
        self.model = build_model(model_path, self.device)
        if self.debug:
            print(f"Kokoro model loaded from: {model_path} (device: {self.device})")

        # If user didn't provide a voice list, fall back to defaults
        if voice_names is None:
            voice_names = list(self.DEFAULT_VOICE_NAMES)

        self.voicepacks_dir = voices_dir
        # torch.load(..., weights_only=True) yields tensors, not modules.
        self.voicepacks: Dict[str, torch.Tensor] = {}
        self._load_voices(voice_names)

        # Prefer the first requested voice; if it failed to load, fall back to
        # the first voice that did load so the engine is usable right away.
        initial_voice = voice_names[0] if voice_names else None
        if initial_voice not in self.voicepacks and self.voicepacks:
            fallback_voice = next(iter(self.voicepacks))
            print(
                f"Voice '{initial_voice}' is not available. "
                f"Using '{fallback_voice}' instead."
            )
            initial_voice = fallback_voice
        self.current_voice_name = initial_voice
        self.current_voicepack = self.voicepacks.get(initial_voice)

        # Warm up the model if possible
        self._warm_up_model()

        self.post_init()

    def post_init(self):
        """
        Called after initialization. Sets the engine name.
        """
        self.engine_name = "kokoro"

    def _lang_code(self) -> str:
        """
        Returns the language code passed to Kokoro's ``generate``.

        This is the first character of the current voice name, or "a" when no
        voice name is set.
        """
        return self.current_voice_name[0] if self.current_voice_name else "a"

    def _load_voices(self, voice_names: list):
        """
        Loads all specified voice .pt files into memory and stores them in a dict.

        A voice that fails to load is reported and skipped; the remaining
        voices are still loaded.

        Args:
            voice_names (list): Names of the voices (file stems) to load.
        """
        for voice_name in voice_names:
            # Bound before the try so the error message below can always use it.
            path = None
            try:
                path = os.path.join(self.voicepacks_dir, f"{voice_name}.pt")
                if not os.path.exists(path):
                    path = os.path.join(self.kokoro_root, path)
                # map_location lets voicepacks saved on a CUDA device load on
                # CPU-only machines as well.
                voicepack = torch.load(
                    path, map_location=self.device, weights_only=True
                ).to(self.device)
                self.voicepacks[voice_name] = voicepack
                if self.debug:
                    print(f"Loaded Kokoro voice: {voice_name}")
            except Exception as e:
                print(f"Failed to load voice {voice_name} from {path}. Error: {e}")

    def _warm_up_model(self):
        """
        Runs a quick, minimal synthesis to get everything ready.

        Skipped when no voice is set. A failing warm-up only prints a warning.
        """
        from kokoro import generate  # Kokoro-specific import

        if self.current_voicepack is None:
            print("No voice is currently set. Skipping model warm-up.")
            return

        warm_text = "Hello world."
        if self.debug:
            print(f"Warming up model with voice: {self.current_voice_name}")
        try:
            # We only care that it runs without error
            generate(self.model, warm_text, self.current_voicepack, lang=self._lang_code())
            if self.debug:
                print("Kokoro model warm-up completed.")
        except Exception as e:
            print(f"Warning: Model warm-up failed. {e}")

    def get_stream_info(self) -> tuple:
        """
        Returns PyAudio stream configuration for Kokoro audio.

        Returns:
            tuple: (format, channels, rate)
        """
        # Kokoro examples use a 24 kHz sample rate
        return (pyaudio.paInt16, 1, 24000)

    def synthesize(self, text: str) -> bool:
        """
        Synthesizes text into audio data using Kokoro.

        The audio is converted to 16-bit PCM bytes and put on ``self.queue``.

        Args:
            text (str): The text to be converted to speech.

        Returns:
            bool: True if successful, False otherwise.
        """
        from kokoro import generate  # Kokoro-specific import

        if self.current_voicepack is None:
            print("No valid voice is currently set.")
            return False

        start_time = time.time()

        try:
            # assumes generate() returns (audio, extra) with audio as float
            # samples nominally in [-1, 1] — TODO confirm against Kokoro
            audio_float32, _ = generate(
                self.model, text, self.current_voicepack, lang=self._lang_code()
            )

            # Clip before scaling: samples outside [-1, 1] would otherwise
            # overflow the int16 range and wrap around in the cast.
            samples = np.clip(np.asarray(audio_float32, dtype=np.float32), -1.0, 1.0)
            audio_int16 = (samples * 32767).astype(np.int16).tobytes()

            # Put the audio in our queue
            self.queue.put(audio_int16)

            if self.debug:
                end_time = time.time()
                print(f"Synthesis complete in {end_time - start_time:.3f}s.")

            return True

        except Exception as e:
            print(f"Error generating audio: {e}")
            return False

    def set_voice(self, voice_name: str):
        """
        Sets the voice used for speech synthesis to one of the loaded voicepacks.

        An unknown name leaves the current voice unchanged and prints a message.

        Args:
            voice_name (str): The name of the voice pack (e.g., 'af_sarah').
        """
        if voice_name in self.voicepacks:
            self.current_voice_name = voice_name
            self.current_voicepack = self.voicepacks[voice_name]
            if self.debug:
                print(f"Voice set to {voice_name}")
        else:
            print(f"Voice '{voice_name}' not found in loaded voicepacks.")

    def get_voices(self):
        """
        Returns a list of loaded voice names.

        Returns:
            list[str]: The loaded voice names.
        """
        return list(self.voicepacks.keys())

    def shutdown(self):
        """
        Cleans up any resources used by KokoroEngine.

        Currently a no-op.
        """
        # If there's anything to release or finalize, do it here.
        pass
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
current_version = "0.4.40"
current_version = "0.4.41"

import setuptools

Expand Down
Empty file added tests/__init__.py
Empty file.
47 changes: 47 additions & 0 deletions tests/kokoro_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
if __name__ == "__main__":
    # Demo script: speaks two short texts with Kokoro, switching the voice
    # between them.
    from RealtimeTTS import TextToAudioStream, KokoroEngine

    # RealtimeTTS sets KokoroEngine to None when the engine module cannot be
    # imported; fail with a clear message instead of "NoneType is not callable".
    if KokoroEngine is None:
        raise ImportError(
            "KokoroEngine could not be imported. "
            "Check that its dependencies are installed."
        )

    def dummy_generator():
        yield "This is the first voice model speaking. "
        yield "The elegance of the style and its flow is simply captivating. "
        yield "We’ll soon switch to another voice model. "

    def dummy_generator_2():
        yield "And here we are! "
        yield "You’re now listening to the second voice model, with a different style and tone. "
        yield "It’s fascinating how Kokoro can adapt seamlessly. "

    # Adjust these paths to your local setup
    kokoro_root = "D:/Dev/Kokoro/Kokoro-82M"

    # Initialize the engine with the first voice
    engine = KokoroEngine(
        kokoro_root=kokoro_root,
    )

    try:
        # Create a TextToAudioStream with the engine
        stream = TextToAudioStream(engine)

        # Play with the first model
        print("Playing with the first model...")
        stream.feed(dummy_generator())
        stream.play(log_synthesized_text=True)

        engine.set_voice("af_sky")
        # Pick one of:
        # "af_nicole",
        # "af",
        # "af_bella",
        # "af_sarah",
        # "am_adam",
        # "am_michael",
        # "bf_emma",
        # "bf_isabella",
        # "bm_george",
        # "bm_lewis",
        # "af_sky"
        stream.feed(dummy_generator_2())
        stream.play(log_synthesized_text=True)
    finally:
        # Shutdown the engine even if playback raised
        engine.shutdown()

0 comments on commit f8f7019

Please sign in to comment.