diff --git a/.gitignore b/.gitignore index d1c4380..a4199c3 100644 --- a/.gitignore +++ b/.gitignore @@ -53,5 +53,4 @@ node_modules pretrained-models/* *_pb2_grpc.py *_pb2.py -poetry.lock -web \ No newline at end of file +poetry.lock \ No newline at end of file diff --git a/README.md b/README.md index 601dd5f..746b71e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- +

VietTTS: An Open-Source Vietnamese Text to Speech

@@ -20,11 +20,11 @@ ## ⭐ Key Features - **TTS**: Text-to-Speech generation with any voice via prompt audio -- **VC**: Voice Conversion (TODO) +- **OpenAI-API-compatible**: Compatible with OpenAI's Text-to-Speech API format ## 🛠️ Installation -VietTTS can be installed via either a Python installer or Docker. +VietTTS can be installed via a Python installer (Linux only, with Windows and macOS support coming soon) or Docker. ### Python Installer ```bash @@ -54,11 +54,8 @@ docker compose build # Run with docker-compose - will create server at: http://localhost:8298 docker compose up -d -# Run with docker run - will create server at: http://localhost:8298 +# Or run with docker run - will create server at: http://localhost:8298 docker run -itd --gpu=alls -p 8298:8298 -v ./pretrained-models:/app/pretrained-models -n viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298 - -# Show available voices -docker exec viet-tts-service viettts show-voices ``` ## 🚀 Usage @@ -108,11 +105,14 @@ viettts --help # Start API Server viettts server --host 0.0.0.0 --port 8298 -# Synthesis speech from text -viettts synthesis --text "Xin chào" --voice 0 --output test.wav - # List all built-in voices viettts show-voices + +# Synthesize speech from text with built-in voices +viettts synthesis --text "Xin chào" --voice 0 --output test.wav + +# Clone voice from a local audio file +viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav ``` ### API Client @@ -144,14 +144,24 @@ with client.audio.speech.with_streaming_response.create( #### CURL ```bash +# Get all built-in voices +curl --location http://0.0.0.0:8298/v1/voices + +# OpenAI format (built-in voices) curl http://localhost:8298/v1/audio/speech \ - -H "Authorization: Bearer viet-tts" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "tts-1", - "input": "Xin chào Việt Nam.", - "voice": "son-tung-mtp" - }' \ +  -H "Authorization: Bearer viet-tts" \ +  -H "Content-Type: application/json" 
\ +  -d '{ +    "model": "tts-1", +    "input": "Xin chào Việt Nam.", +    "voice": "son-tung-mtp" +  }' \ +  --output speech.wav + +# API with voice from local file +curl --location http://0.0.0.0:8298/v1/tts \ + --form 'text="xin chào"' \ + --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \ --output speech.wav ``` diff --git a/README_VN.md b/README_VN.md index c907c31..1cbdf26 100644 --- a/README_VN.md +++ b/README_VN.md @@ -1,5 +1,5 @@

+  

VietTTS: Công cụ chuyển văn bản thành giọng nói tiếng Việt mã nguồn mở

@@ -18,10 +18,10 @@ ## ⭐ Tính năng nổi bật - **TTS**: Tổng hợp giọng nói từ văn bản với bất kỳ giọng nào qua audio mẫu -- **VC**: Chuyển đổi giọng nói (TODO) +- **OpenAI-API-compatible**: Tương thích với API Text to Speech OpenAI ## 🛠️ Cài đặt -VietTTS có thể cài đặt qua trình cài đặt Python hoặc Docker. +VietTTS có thể được cài đặt qua trình cài đặt Python (chỉ hỗ trợ Linux, Windows và macOS sẽ có trong tương lai) hoặc Docker. ### Trình cài đặt Python @@ -53,9 +53,6 @@ docker compose up -d # Chạy bằng docker run - tạo server tại: http://localhost:8298 docker run -itd --gpu=alls -p 8298:8298 -v ./pretrained-models:/app/pretrained-models -n viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298 - -# Hiển thị danh sách giọng nói sẵn có -docker exec viet-tts-service viettts show-voices ``` ## 🚀 Sử dụng @@ -109,11 +106,14 @@ viettts --help # Khởi động API Server viettts server --host 0.0.0.0 --port 8298 -# Tổng hợp giọng nói từ văn bản +# Xem tất cả các giọng nói có sẵn +viettts show-voices + +# Tổng hợp giọng nói từ văn bản với giọng có sẵn viettts synthesis --text "Xin chào" --voice 0 --output test.wav -# Liệt kê tất cả các giọng nói có sẵn -viettts show-voices +# Sao chép giọng từ audio file bất kì +viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav ``` ### API Client @@ -149,6 +149,10 @@ with client.audio.speech.with_streaming_response.create( #### CURL ```bash +# Lấy danh sách giọng có sẵn +curl --location http://0.0.0.0:8298/v1/voices + +# OpenAI API format curl http://localhost:8298/v1/audio/speech \   -H "Authorization: Bearer viet-tts" \   -H "Content-Type: application/json" \ @@ -158,6 +162,12 @@ curl http://localhost:8298/v1/audio/speech \     "voice": "son-tung-mtp"   }' \   --output speech.wav + +# API với giọng từ file local +curl --location http://0.0.0.0:8298/v1/tts \ + --form 'text="xin chào"' \ + --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \ + --output speech.wav ``` #### Node diff --git 
a/pyproject.toml b/pyproject.toml index 3189f0a..f850dcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "viet-tts" +name = "viettts" version = "0.1.0" description = "VietTTS: An Open-Source Vietnamese Text to Speech" authors = ["dangvansam "] @@ -8,17 +8,14 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.10" conformer = "0.3.2" -deepspeed = "0.14.2" diffusers = "0.27.2" gradio = "4.32.2" hydra-core = "1.3.2" hyperpyyaml = "1.2.2" librosa = "0.10.2" -networkx = "3.1" omegaconf = "2.3.0" onnx = "1.16.0" onnxruntime-gpu = "1.16.0" -openai-whisper = "20231117" protobuf = "4.25" pydantic = "2.7.0" soundfile = "0.12.1" @@ -29,15 +26,16 @@ wget = "3.2" fastapi = "0.111.0" fastapi-cli = "0.0.4" loguru = "0.7.2" -natsort = "8.4.0" vinorm = "^2.0.7" huggingface-hub = "0.24.7" click = "^8.1.7" gunicorn = "^23.0.0" silero-vad = "^5.1.2" +tiktoken = "^0.8.0" +openai-whisper = "^20240930" [tool.poetry.scripts] -viet-tts = "viettts.cli:cli" +viettts = "viettts.cli:cli" [build-system] requires = ["poetry-core"] diff --git a/viettts/cli.py b/viettts/cli.py index e9dc288..7262d0a 100644 --- a/viettts/cli.py +++ b/viettts/cli.py @@ -14,9 +14,9 @@ MODEL_DIR = 'pretrained-models' @click.command('server') -@click.option('-h', '--host', type=str, default='0.0.0.0') -@click.option('-p', '--port', type=int, default=8298) -@click.option('-w', '--workers', type=int, default=1) +@click.option('-h', '--host', type=str, default='0.0.0.0', help="The host address to bind the server to. Default is '0.0.0.0'.") +@click.option('-p', '--port', type=int, default=8298, help="The port number to bind the server to. Default is 8298.") +@click.option('-w', '--workers', type=int, default=1, help="The number of worker processes to handle requests. Default is 1.") def start_server(host: str, port: int, workers: int): """Start API server (OpenAI TTS API compatible). 
@@ -37,17 +37,14 @@ def start_server(host: str, port: int, workers: int): @click.command('synthesis') -@click.option('-t', "--text", type=str, required=True) -@click.option('-v', "--voice", type=str, default='1') -@click.option('-s', "--speed", type=float, default=1) -@click.option('-o', "--output", type=str, default='output.wav') +@click.option('-t', "--text", type=str, required=True, help="The input text to synthesize into speech.") +@click.option('-v', "--voice", type=str, default='1', help="The voice ID or file path to clone the voice from. Default is '1'.") +@click.option('-s', "--speed", type=float, default=1, help="The speed multiplier for the speech. Default is 1 (normal speed).") +@click.option('-o', "--output", type=str, default='output.wav', help="The file path to save the synthesized audio. Default is 'output.wav'.") def synthesis(text: str, voice: str, speed: float, output: str): """Synthesis audio from text and save to file. - Usage: - viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --output test_nu-nhe-nhang.wav - viettts synthesis --text 'Chào bạn đến với Hà Nội' --voice 8 --speed 1.2 --output test_voice_8_speed_1.2.wav - viettts synthesis --text 'Bạn có thể sao chép giọng sẵn có' --voice Downloads/audio.wav + Usage: viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --voice 8 --speed 1.2 --output test_nu-nhe-nhang.wav """ logger.info("Starting synthesis") st = time.perf_counter() @@ -107,7 +104,8 @@ def cli(): """ VietTTS CLI v0.1.0 - Vietnamese Text To Speech and Voice Clone - License: Apache 2.0 - Author: + Vietnamese Text To Speech and Voice Clone + License: Apache 2.0 - Author: """ pass diff --git a/viettts/flow/flow.py b/viettts/flow/flow.py index 9408037..44375a2 100644 --- a/viettts/flow/flow.py +++ b/viettts/flow/flow.py @@ -10,23 +10,49 @@ class MaskedDiffWithXvec(torch.nn.Module): def __init__(self, - input_size: int = 512, - output_size: int = 80, - spk_embed_dim: int = 192, - output_type: str = "mel", - 
vocab_size: int = 4096, - input_frame_rate: int = 50, - only_mask_loss: bool = True, - encoder: torch.nn.Module = None, - length_regulator: torch.nn.Module = None, - decoder: torch.nn.Module = None, - decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, - 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', - 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), - 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, - 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, - mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, - 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: torch.nn.Module = None, + length_regulator: torch.nn.Module = None, + decoder: torch.nn.Module = None, + decoder_conf: Dict = { + 'in_channels': 240, + 'out_channel': 80, + 'spk_emb_dim': 80, + 'n_spks': 1, + 'cfm_params': DictConfig({ + 'sigma_min': 1e-06, + 'solver': 'euler', + 't_scheduler': 'cosine', + 'training_cfg_rate': 0.2, + 'inference_cfg_rate': 0.7, + 'reg_loss_type': 'l1' + }), + 'decoder_params': { + 'channels': [256, 256], + 'dropout': 0.0, + 'attention_head_dim': 64, + 'n_blocks': 4, + 'num_mid_blocks': 12, + 'num_heads': 8, + 'act_fn': 'gelu' + } + }, + mel_feat_conf: Dict = { + 'n_fft': 1024, + 'num_mels': 80, + 'sampling_rate': 22050, + 'hop_size': 256, + 'win_size': 1024, + 'fmin': 0, + 'fmax': 8000 + } + ): super().__init__() self.input_size = input_size self.output_size = output_size diff --git a/viettts/server.py b/viettts/server.py index 0fdca90..6902b24 100644 --- a/viettts/server.py +++ b/viettts/server.py @@ -1,18 +1,23 @@ import io import os import queue +import random import 
subprocess import threading import wave +import tempfile +import shutil +import requests import numpy as np from loguru import logger +from datetime import datetime from typing import Any, List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel from anyio import CapacityLimiter from anyio.lowlevel import RunVar -from fastapi import FastAPI, UploadFile, Form, File -from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse +from fastapi import FastAPI, UploadFile, Form, File, HTTPException +from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse, FileResponse from fastapi.middleware.cors import CORSMiddleware from viettts.tts import TTS @@ -27,8 +32,12 @@ app = FastAPI( - title="VietTTS", - description="VietTTS API" + title="VietTTS API", + description=""" + VietTTS API (https://github.com/dangvansam/viet-tts) + Vietnamese Text To Speech and Voice Clone + License: Apache 2.0 - Author: + """ ) app.add_middleware( CORSMiddleware, @@ -47,13 +56,17 @@ def generate_data(model_output): yield audio -class GenerateSpeechRequest(BaseModel): +class OpenAITTSRequest(BaseModel): input: str model: str = "tts-1" - voice: str = list(VOICE_MAP)[1] + voice: str = random.choice(list(VOICE_MAP)) response_format: str = "wav" speed: float = 1.0 +class TTSRequest(BaseModel): + text: str + voice: str = random.choice(list(VOICE_MAP)) + speed: float = 1.0 def wav_chunk_header(sample_rate=22050, bit_depth=16, channels=1): buffer = io.BytesIO() @@ -69,7 +82,11 @@ def wav_chunk_header(sample_rate=22050, bit_depth=16, channels=1): @app.get("/", response_class=PlainTextResponse) async def root(): - return 'VietTTS API Server Is Runing' + return 'VietTTS API' + +@app.get("/health", response_class=PlainTextResponse) +async def health(): + return 'VietTTS API is running...' 
@app.get("/voices") @app.get("/v1/voices") @@ -78,8 +95,9 @@ async def show_voices(): @app.post("/audio/speech") @app.post("/v1/audio/speech") -async def tts(tts_request: GenerateSpeechRequest): - logger.info(f"Generate speech request: {tts_request.dict()}") +async def openai_api_tts(tts_request: OpenAITTSRequest): + logger.info(f"Received TTS request: {tts_request.dict()}") + if tts_request.voice.isdigit(): voice_file = list(VOICE_MAP.values())[int(tts_request.voice)] else: @@ -199,6 +217,117 @@ async def cleanup(): background=cleanup ) +@app.post("/tts") +@app.post("/v1/tts") +async def tts( + text: str = Form(...), + voice: str = Form("0"), + speed: float = Form(1.0), + audio_url: str = Form(None), + audio_file: UploadFile = File(None) +): + logger.info(f"Received TTS request: text={text}, voice={voice}, speed={speed}, audio_url={audio_url}") + voice_file = None + + # Case 1: Uploaded audio file + if audio_file: + temp_audio_file = tempfile.NamedTemporaryFile( + delete=False, + suffix=f'.{audio_file.filename.split(".")[-1]}' + ) + try: + with open(temp_audio_file.name, "wb") as temp_file: + shutil.copyfileobj(audio_file.file, temp_file) + voice_file = temp_audio_file.name + logger.info(f"Using uploaded audio file as voice: {voice_file}") + finally: + audio_file.file.close() + + # Case 2: Audio URL + elif audio_url: + temp_audio_file = tempfile.NamedTemporaryFile( + delete=False, + suffix=f'.{audio_url.lower().split(".")[-1]}' + ) + try: + response = requests.get(audio_url, stream=True) + if response.status_code != 200: + raise HTTPException(status_code=400, detail="Failed to fetch audio from URL") + with open(temp_audio_file.name, "wb") as temp_file: + shutil.copyfileobj(response.raw, temp_file) + voice_file = temp_audio_file.name + logger.info(f"Using audio URL as voice: {voice_file}") + finally: + response.close() + + # Case 3: Predefined voice + elif voice: + if voice.isdigit(): + voice_file = list(VOICE_MAP.values())[int(voice)] + else: + voice_file = 
VOICE_MAP.get(voice) + + if not voice_file: + logger.error(f"Voice {voice} not found") + raise HTTPException(status_code=404, detail="Voice not found") + + else: + voice_file = random.choice(list(VOICE_MAP.values())) + + # Error if no voice file is available + if not voice_file or not os.path.exists(voice_file): + raise HTTPException(status_code=400, detail="No valid voice file provided") + + prompt_speech_16k = load_prompt_speech_from_file( + filepath=voice_file, + min_duration=3, + max_duration=5 + ) + + temp_output_file = tempfile.NamedTemporaryFile( + delete=False, + suffix=f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3" + ) + + try: + model_output = tts_obj.inference_tts( + tts_text=text, + prompt_speech_16k=prompt_speech_16k, + speed=speed, + stream=False + ) + + raw_audio = b''.join(chunk['tts_speech'].numpy().tobytes() for chunk in model_output) + ffmpeg_args = [ + "ffmpeg", "-loglevel", "error", "-y", "-f", "f32le", "-ar", "24000", "-ac", "1", + "-i", "-", "-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k", temp_output_file.name + ] + ffmpeg_proc = subprocess.run( + ffmpeg_args, + input=raw_audio, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + + if ffmpeg_proc.returncode != 0: + logger.error(f"FFmpeg error: {ffmpeg_proc.stderr.decode()}") + raise HTTPException(status_code=500, detail="Error during audio processing") + + if not os.path.exists(temp_output_file.name): + logger.error(f"FFmpeg did not create the output file: {temp_output_file.name}") + raise HTTPException(status_code=500, detail="FFmpeg failed to produce the output file") + + return FileResponse( + path=temp_output_file.name, + media_type="audio/mpeg", + filename=temp_output_file.name.split("/")[-1] + ) + + finally: + if audio_file or audio_url: + if os.path.exists(temp_audio_file.name): + os.unlink(temp_audio_file.name) + @app.on_event("startup") async def startup(): diff --git a/viettts/utils/file_utils.py b/viettts/utils/file_utils.py index b1d91f9..66d2a27 100644 --- 
a/viettts/utils/file_utils.py +++ b/viettts/utils/file_utils.py @@ -1,19 +1,75 @@ import os +import subprocess import torchaudio import soundfile import numpy as np from glob import glob +from loguru import logger from huggingface_hub import snapshot_download from viettts.utils.vad import get_speech +import torchaudio +import os +import subprocess +import tempfile + + +def convert_to_wav(input_filepath: str, target_sr: int) -> str: + """ + Convert an input audio file to WAV format with the desired sample rate using FFmpeg. + + Args: + input_filepath (str): Path to the input audio file. + target_sr (int): Target sample rate. + + Returns: + str: Path to the converted WAV file. + """ + temp_wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") + temp_wav_filepath = temp_wav_file.name + temp_wav_file.close() + + ffmpeg_command = [ + "ffmpeg", "-y", + "-loglevel", "error", + "-i", input_filepath, + "-ar", str(target_sr), + "-ac", "1", + temp_wav_filepath + ] + + result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.returncode != 0: + os.unlink(temp_wav_filepath) + raise RuntimeError(f"FFmpeg conversion failed: {result.stderr.decode()}") + + return temp_wav_filepath + def load_wav(filepath: str, target_sr: int): + """ + Load an audio file in any supported format, convert it to WAV, and load as a tensor. + + Args: + filepath (str): Path to the audio file in any format. + target_sr (int): Target sample rate. + + Returns: + Tensor: Loaded audio tensor resampled to the target sample rate. 
+ """ + # Check if the file is already in WAV format + if not filepath.lower().endswith(".wav"): + logger.info(f"Converting {filepath} to WAV format") + filepath = convert_to_wav(filepath, target_sr) + + # Load the WAV file speech, sample_rate = torchaudio.load(filepath) - speech = speech.mean(dim=0, keepdim=True) + speech = speech.mean(dim=0, keepdim=True) # Convert to mono if not already if sample_rate != target_sr: - assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) + assert sample_rate > target_sr, f'WAV sample rate {sample_rate} must be greater than {target_sr}' speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) + return speech diff --git a/web/.gitkeep b/web/.gitkeep new file mode 100644 index 0000000..e69de29