diff --git a/.gitignore b/.gitignore
index d1c4380..a4199c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,5 +53,4 @@ node_modules
pretrained-models/*
*_pb2_grpc.py
*_pb2.py
-poetry.lock
-web
\ No newline at end of file
+poetry.lock
\ No newline at end of file
diff --git a/README.md b/README.md
index 601dd5f..746b71e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
+
VietTTS: An Open-Source Vietnamese Text to Speech
@@ -20,11 +20,11 @@
## ⭐ Key Features
- **TTS**: Text-to-Speech generation with any voice via prompt audio
-- **VC**: Voice Conversion (TODO)
+- **OpenAI-API-compatible**: Compatible with OpenAI's Text-to-Speech API format
## 🛠️ Installation
-VietTTS can be installed via either a Python installer or Docker.
+VietTTS can be installed via a Python installer (Linux only, with Windows and macOS support coming soon) or Docker.
### Python Installer
```bash
@@ -54,11 +54,8 @@ docker compose build
# Run with docker-compose - will create server at: http://localhost:8298
docker compose up -d
-# Run with docker run - will create server at: http://localhost:8298
+# Or run with docker run - will create server at: http://localhost:8298
docker run -itd --gpu=alls -p 8298:8298 -v ./pretrained-models:/app/pretrained-models -n viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298
-
-# Show available voices
-docker exec viet-tts-service viettts show-voices
```
## 🚀 Usage
@@ -108,11 +105,14 @@ viettts --help
# Start API Server
viettts server --host 0.0.0.0 --port 8298
-# Synthesis speech from text
-viettts synthesis --text "Xin chào" --voice 0 --output test.wav
-
# List all built-in voices
viettts show-voices
+
+# Synthesize speech from text with built-in voices
+viettts synthesis --text "Xin chào" --voice 0 --output test.wav
+
+# Clone voice from a local audio file
+viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav
```
### API Client
@@ -144,14 +144,24 @@ with client.audio.speech.with_streaming_response.create(
#### CURL
```bash
+# Get all built-in voices
+curl --location http://localhost:8298/v1/voices
+
+# OpenAI format (built-in voices)
curl http://localhost:8298/v1/audio/speech \
- -H "Authorization: Bearer viet-tts" \
- -H "Content-Type: application/json" \
- -d '{
- "model": "tts-1",
- "input": "Xin chào Việt Nam.",
- "voice": "son-tung-mtp"
- }' \
+ -H "Authorization: Bearer viet-tts" \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "tts-1",
+ "input": "Xin chào Việt Nam.",
+ "voice": "son-tung-mtp"
+ }' \
+ --output speech.wav
+
+# API with voice from local file
+curl --location http://localhost:8298/v1/tts \
+ --form 'text="xin chào"' \
+ --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \
--output speech.wav
```
diff --git a/README_VN.md b/README_VN.md
index c907c31..1cbdf26 100644
--- a/README_VN.md
+++ b/README_VN.md
@@ -1,5 +1,5 @@
-
+
VietTTS: Công cụ chuyển văn bản thành giọng nói tiếng Việt mã nguồn mở
@@ -18,10 +18,10 @@
## ⭐ Tính năng nổi bật
- **TTS**: Tổng hợp giọng nói từ văn bản với bất kỳ giọng nào qua audio mẫu
-- **VC**: Chuyển đổi giọng nói (TODO)
+- **OpenAI-API-compatible**: Tương thích với định dạng API Text-to-Speech của OpenAI
## 🛠️ Cài đặt
-VietTTS có thể cài đặt qua trình cài đặt Python hoặc Docker.
+VietTTS có thể được cài đặt qua trình cài đặt Python (hiện chỉ hỗ trợ Linux; Windows và macOS sẽ được hỗ trợ trong tương lai) hoặc Docker.
### Trình cài đặt Python
@@ -53,9 +53,6 @@ docker compose up -d
# Chạy bằng docker run - tạo server tại: http://localhost:8298
docker run -itd --gpu=alls -p 8298:8298 -v ./pretrained-models:/app/pretrained-models -n viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298
-
-# Hiển thị danh sách giọng nói sẵn có
-docker exec viet-tts-service viettts show-voices
```
## 🚀 Sử dụng
@@ -109,11 +106,14 @@ viettts --help
# Khởi động API Server
viettts server --host 0.0.0.0 --port 8298
-# Tổng hợp giọng nói từ văn bản
+# Xem tất cả các giọng nói có sẵn
+viettts show-voices
+
+# Tổng hợp giọng nói từ văn bản với giọng có sẵn
viettts synthesis --text "Xin chào" --voice 0 --output test.wav
-# Liệt kê tất cả các giọng nói có sẵn
-viettts show-voices
+# Sao chép giọng từ audio file bất kì
+viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav
```
### API Client
@@ -149,6 +149,10 @@ with client.audio.speech.with_streaming_response.create(
#### CURL
```bash
+# Lấy danh sách giọng có sẵn
+curl --location http://localhost:8298/v1/voices
+
+# OpenAI API format
curl http://localhost:8298/v1/audio/speech \
-H "Authorization: Bearer viet-tts" \
-H "Content-Type: application/json" \
@@ -158,6 +162,12 @@ curl http://localhost:8298/v1/audio/speech \
"voice": "son-tung-mtp"
}' \
--output speech.wav
+
+# API với giọng từ file local
+curl --location http://localhost:8298/v1/tts \
+ --form 'text="xin chào"' \
+ --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \
+ --output speech.wav
```
#### Node
diff --git a/pyproject.toml b/pyproject.toml
index 3189f0a..f850dcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
[tool.poetry]
-name = "viet-tts"
+name = "viettts"
version = "0.1.0"
description = "VietTTS: An Open-Source Vietnamese Text to Speech"
authors = ["dangvansam "]
@@ -8,17 +8,14 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
conformer = "0.3.2"
-deepspeed = "0.14.2"
diffusers = "0.27.2"
gradio = "4.32.2"
hydra-core = "1.3.2"
hyperpyyaml = "1.2.2"
librosa = "0.10.2"
-networkx = "3.1"
omegaconf = "2.3.0"
onnx = "1.16.0"
onnxruntime-gpu = "1.16.0"
-openai-whisper = "20231117"
protobuf = "4.25"
pydantic = "2.7.0"
soundfile = "0.12.1"
@@ -29,15 +26,16 @@ wget = "3.2"
fastapi = "0.111.0"
fastapi-cli = "0.0.4"
loguru = "0.7.2"
-natsort = "8.4.0"
vinorm = "^2.0.7"
huggingface-hub = "0.24.7"
click = "^8.1.7"
gunicorn = "^23.0.0"
silero-vad = "^5.1.2"
+tiktoken = "^0.8.0"
+openai-whisper = "^20240930"
[tool.poetry.scripts]
-viet-tts = "viettts.cli:cli"
+viettts = "viettts.cli:cli"
[build-system]
requires = ["poetry-core"]
diff --git a/viettts/cli.py b/viettts/cli.py
index e9dc288..7262d0a 100644
--- a/viettts/cli.py
+++ b/viettts/cli.py
@@ -14,9 +14,9 @@
MODEL_DIR = 'pretrained-models'
@click.command('server')
-@click.option('-h', '--host', type=str, default='0.0.0.0')
-@click.option('-p', '--port', type=int, default=8298)
-@click.option('-w', '--workers', type=int, default=1)
+@click.option('-h', '--host', type=str, default='0.0.0.0', help="The host address to bind the server to. Default is '0.0.0.0'.")
+@click.option('-p', '--port', type=int, default=8298, help="The port number to bind the server to. Default is 8298.")
+@click.option('-w', '--workers', type=int, default=1, help="The number of worker processes to handle requests. Default is 1.")
def start_server(host: str, port: int, workers: int):
"""Start API server (OpenAI TTS API compatible).
@@ -37,17 +37,14 @@ def start_server(host: str, port: int, workers: int):
@click.command('synthesis')
-@click.option('-t', "--text", type=str, required=True)
-@click.option('-v', "--voice", type=str, default='1')
-@click.option('-s', "--speed", type=float, default=1)
-@click.option('-o', "--output", type=str, default='output.wav')
+@click.option('-t', "--text", type=str, required=True, help="The input text to synthesize into speech.")
+@click.option('-v', "--voice", type=str, default='1', help="The voice ID or file path to clone the voice from. Default is '1'.")
+@click.option('-s', "--speed", type=float, default=1, help="The speed multiplier for the speech. Default is 1 (normal speed).")
+@click.option('-o', "--output", type=str, default='output.wav', help="The file path to save the synthesized audio. Default is 'output.wav'.")
def synthesis(text: str, voice: str, speed: float, output: str):
"""Synthesis audio from text and save to file.
- Usage:
- viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --output test_nu-nhe-nhang.wav
- viettts synthesis --text 'Chào bạn đến với Hà Nội' --voice 8 --speed 1.2 --output test_voice_8_speed_1.2.wav
- viettts synthesis --text 'Bạn có thể sao chép giọng sẵn có' --voice Downloads/audio.wav
+ Usage: viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --speed 1.2 --output test_nu-nhe-nhang.wav
"""
logger.info("Starting synthesis")
st = time.perf_counter()
@@ -107,7 +104,8 @@ def cli():
"""
VietTTS CLI v0.1.0
- Vietnamese Text To Speech and Voice Clone - License: Apache 2.0 - Author:
+ Vietnamese Text To Speech and Voice Clone
+ License: Apache 2.0 - Author:
"""
pass
diff --git a/viettts/flow/flow.py b/viettts/flow/flow.py
index 9408037..44375a2 100644
--- a/viettts/flow/flow.py
+++ b/viettts/flow/flow.py
@@ -10,23 +10,49 @@
class MaskedDiffWithXvec(torch.nn.Module):
def __init__(self,
- input_size: int = 512,
- output_size: int = 80,
- spk_embed_dim: int = 192,
- output_type: str = "mel",
- vocab_size: int = 4096,
- input_frame_rate: int = 50,
- only_mask_loss: bool = True,
- encoder: torch.nn.Module = None,
- length_regulator: torch.nn.Module = None,
- decoder: torch.nn.Module = None,
- decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
- 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
- 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
- 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
- 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
- mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
- 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+ input_size: int = 512,
+ output_size: int = 80,
+ spk_embed_dim: int = 192,
+ output_type: str = "mel",
+ vocab_size: int = 4096,
+ input_frame_rate: int = 50,
+ only_mask_loss: bool = True,
+ encoder: torch.nn.Module = None,
+ length_regulator: torch.nn.Module = None,
+ decoder: torch.nn.Module = None,
+ decoder_conf: Dict = {
+ 'in_channels': 240,
+ 'out_channel': 80,
+ 'spk_emb_dim': 80,
+ 'n_spks': 1,
+ 'cfm_params': DictConfig({
+ 'sigma_min': 1e-06,
+ 'solver': 'euler',
+ 't_scheduler': 'cosine',
+ 'training_cfg_rate': 0.2,
+ 'inference_cfg_rate': 0.7,
+ 'reg_loss_type': 'l1'
+ }),
+ 'decoder_params': {
+ 'channels': [256, 256],
+ 'dropout': 0.0,
+ 'attention_head_dim': 64,
+ 'n_blocks': 4,
+ 'num_mid_blocks': 12,
+ 'num_heads': 8,
+ 'act_fn': 'gelu'
+ }
+ },
+ mel_feat_conf: Dict = {
+ 'n_fft': 1024,
+ 'num_mels': 80,
+ 'sampling_rate': 22050,
+ 'hop_size': 256,
+ 'win_size': 1024,
+ 'fmin': 0,
+ 'fmax': 8000
+ }
+ ):
super().__init__()
self.input_size = input_size
self.output_size = output_size
diff --git a/viettts/server.py b/viettts/server.py
index 0fdca90..6902b24 100644
--- a/viettts/server.py
+++ b/viettts/server.py
@@ -1,18 +1,23 @@
import io
import os
import queue
+import random
import subprocess
import threading
import wave
+import tempfile
+import shutil
+import requests
import numpy as np
from loguru import logger
+from datetime import datetime
from typing import Any, List, Optional
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
from anyio import CapacityLimiter
from anyio.lowlevel import RunVar
-from fastapi import FastAPI, UploadFile, Form, File
-from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
+from fastapi import FastAPI, UploadFile, Form, File, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from viettts.tts import TTS
@@ -27,8 +32,12 @@
app = FastAPI(
- title="VietTTS",
- description="VietTTS API"
+ title="VietTTS API",
+ description="""
+ VietTTS API (https://github.com/dangvansam/viet-tts)
+ Vietnamese Text To Speech and Voice Clone
+ License: Apache 2.0 - Author:
+ """
)
app.add_middleware(
CORSMiddleware,
@@ -47,13 +56,17 @@ def generate_data(model_output):
yield audio
-class GenerateSpeechRequest(BaseModel):
+class OpenAITTSRequest(BaseModel):
input: str
model: str = "tts-1"
- voice: str = list(VOICE_MAP)[1]
+ voice: str = random.choice(list(VOICE_MAP))
response_format: str = "wav"
speed: float = 1.0
+class TTSRequest(BaseModel):
+ text: str
+ voice: str = random.choice(list(VOICE_MAP))
+ speed: float = 1.0
def wav_chunk_header(sample_rate=22050, bit_depth=16, channels=1):
buffer = io.BytesIO()
@@ -69,7 +82,11 @@ def wav_chunk_header(sample_rate=22050, bit_depth=16, channels=1):
@app.get("/", response_class=PlainTextResponse)
async def root():
- return 'VietTTS API Server Is Runing'
+ return 'VietTTS API'
+
+@app.get("/health", response_class=PlainTextResponse)
+async def health():
+ return 'VietTTS API is running...'
@app.get("/voices")
@app.get("/v1/voices")
@@ -78,8 +95,9 @@ async def show_voices():
@app.post("/audio/speech")
@app.post("/v1/audio/speech")
-async def tts(tts_request: GenerateSpeechRequest):
- logger.info(f"Generate speech request: {tts_request.dict()}")
+async def openai_api_tts(tts_request: OpenAITTSRequest):
+ logger.info(f"Received TTS request: {tts_request.dict()}")
+
if tts_request.voice.isdigit():
voice_file = list(VOICE_MAP.values())[int(tts_request.voice)]
else:
@@ -199,6 +217,117 @@ async def cleanup():
background=cleanup
)
+@app.post("/tts")
+@app.post("/v1/tts")
+async def tts(
+ text: str = Form(...),
+ voice: str = Form("0"),
+ speed: float = Form(1.0),
+ audio_url: str = Form(None),
+ audio_file: UploadFile = File(None)
+):
+ logger.info(f"Received TTS request: text={text}, voice={voice}, speed={speed}, audio_url={audio_url}")
+ voice_file = None
+
+ # Case 1: Uploaded audio file
+ if audio_file:
+ temp_audio_file = tempfile.NamedTemporaryFile(
+ delete=False,
+ suffix=f'.{audio_file.filename.split(".")[-1]}'
+ )
+ try:
+ with open(temp_audio_file.name, "wb") as temp_file:
+ shutil.copyfileobj(audio_file.file, temp_file)
+ voice_file = temp_audio_file.name
+ logger.info(f"Using uploaded audio file as voice: {voice_file}")
+ finally:
+ audio_file.file.close()
+
+ # Case 2: Audio URL
+ elif audio_url:
+ temp_audio_file = tempfile.NamedTemporaryFile(
+ delete=False,
+ suffix=f'.{audio_url.lower().split(".")[-1]}'
+ )
+ try:
+ response = requests.get(audio_url, stream=True)
+ if response.status_code != 200:
+ raise HTTPException(status_code=400, detail="Failed to fetch audio from URL")
+ with open(temp_audio_file.name, "wb") as temp_file:
+ shutil.copyfileobj(response.raw, temp_file)
+ voice_file = temp_audio_file.name
+ logger.info(f"Using audio URL as voice: {voice_file}")
+ finally:
+ response.close()
+
+ # Case 3: Predefined voice
+ elif voice:
+ if voice.isdigit():
+ voice_file = list(VOICE_MAP.values())[int(voice)]
+ else:
+ voice_file = VOICE_MAP.get(voice)
+
+ if not voice_file:
+ logger.error(f"Voice {voice} not found")
+ raise HTTPException(status_code=404, detail="Voice not found")
+
+ else:
+ voice_file = random.choice(list(VOICE_MAP.values()))
+
+ # Error if no voice file is available
+ if not voice_file or not os.path.exists(voice_file):
+ raise HTTPException(status_code=400, detail="No valid voice file provided")
+
+ prompt_speech_16k = load_prompt_speech_from_file(
+ filepath=voice_file,
+ min_duration=3,
+ max_duration=5
+ )
+
+ temp_output_file = tempfile.NamedTemporaryFile(
+ delete=False,
+ suffix=f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
+ )
+
+ try:
+ model_output = tts_obj.inference_tts(
+ tts_text=text,
+ prompt_speech_16k=prompt_speech_16k,
+ speed=speed,
+ stream=False
+ )
+
+ raw_audio = b''.join(chunk['tts_speech'].numpy().tobytes() for chunk in model_output)
+ ffmpeg_args = [
+ "ffmpeg", "-loglevel", "error", "-y", "-f", "f32le", "-ar", "24000", "-ac", "1",
+ "-i", "-", "-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k", temp_output_file.name
+ ]
+ ffmpeg_proc = subprocess.run(
+ ffmpeg_args,
+ input=raw_audio,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+
+ if ffmpeg_proc.returncode != 0:
+ logger.error(f"FFmpeg error: {ffmpeg_proc.stderr.decode()}")
+ raise HTTPException(status_code=500, detail="Error during audio processing")
+
+ if not os.path.exists(temp_output_file.name):
+ logger.error(f"FFmpeg did not create the output file: {temp_output_file.name}")
+ raise HTTPException(status_code=500, detail="FFmpeg failed to produce the output file")
+
+ return FileResponse(
+ path=temp_output_file.name,
+ media_type="audio/mpeg",
+ filename=temp_output_file.name.split("/")[-1]
+ )
+
+ finally:
+ if audio_file or audio_url:
+ if os.path.exists(temp_audio_file.name):
+ os.unlink(temp_audio_file.name)
+
@app.on_event("startup")
async def startup():
diff --git a/viettts/utils/file_utils.py b/viettts/utils/file_utils.py
index b1d91f9..66d2a27 100644
--- a/viettts/utils/file_utils.py
+++ b/viettts/utils/file_utils.py
@@ -1,19 +1,75 @@
import os
+import subprocess
import torchaudio
import soundfile
import numpy as np
from glob import glob
+from loguru import logger
from huggingface_hub import snapshot_download
from viettts.utils.vad import get_speech
+import torchaudio
+import os
+import subprocess
+import tempfile
+
+
+def convert_to_wav(input_filepath: str, target_sr: int) -> str:
+ """
+ Convert an input audio file to WAV format with the desired sample rate using FFmpeg.
+
+ Args:
+ input_filepath (str): Path to the input audio file.
+ target_sr (int): Target sample rate.
+
+ Returns:
+ str: Path to the converted WAV file.
+ """
+ temp_wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+ temp_wav_filepath = temp_wav_file.name
+ temp_wav_file.close()
+
+ ffmpeg_command = [
+ "ffmpeg", "-y",
+ "-loglevel", "error",
+ "-i", input_filepath,
+ "-ar", str(target_sr),
+ "-ac", "1",
+ temp_wav_filepath
+ ]
+
+ result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if result.returncode != 0:
+ os.unlink(temp_wav_filepath)
+ raise RuntimeError(f"FFmpeg conversion failed: {result.stderr.decode()}")
+
+ return temp_wav_filepath
+
def load_wav(filepath: str, target_sr: int):
+ """
+ Load an audio file in any supported format, convert it to WAV, and load as a tensor.
+
+ Args:
+ filepath (str): Path to the audio file in any format.
+ target_sr (int): Target sample rate.
+
+ Returns:
+ Tensor: Loaded audio tensor resampled to the target sample rate.
+ """
+ # Check if the file is already in WAV format
+ if not filepath.lower().endswith(".wav"):
+ logger.info(f"Converting {filepath} to WAV format")
+ filepath = convert_to_wav(filepath, target_sr)
+
+ # Load the WAV file
speech, sample_rate = torchaudio.load(filepath)
- speech = speech.mean(dim=0, keepdim=True)
+ speech = speech.mean(dim=0, keepdim=True) # Convert to mono if not already
if sample_rate != target_sr:
- assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
+ assert sample_rate > target_sr, f'WAV sample rate {sample_rate} must be greater than {target_sr}'
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
+
return speech
diff --git a/web/.gitkeep b/web/.gitkeep
new file mode 100644
index 0000000..e69de29