Merge pull request #5 from dangvansam/feature/voice-clone

add clone voice from local file api and cli
dangvansam · Dec 11, 2024 · 87ae2a0 · 87ae2a0
2 parents 785f501 + 8932564
commit 87ae2a0
Show file tree

Hide file tree

Showing 9 changed files with 301 additions and 75 deletions.
diff --git a/.gitignore b/.gitignore
@@ -53,5 +53,4 @@ node_modules
 pretrained-models/*
 *_pb2_grpc.py
 *_pb2.py
-poetry.lock
-web
+poetry.lock
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 <!-- # VietTTS: An Open-Source Vietnamese Text to Speech -->
 <p align="center">
-  <img src="assets/viet-tts-medium.png" style="width: 22%">
+  <img src="assets/viet-tts-medium.png" style="width: 200px">
   <h1 align="center" style="color: white; font-weight: bold; font-family:roboto"><span style="color: white; font-weight: bold; font-family:roboto">VietTTS</span>: An Open-Source Vietnamese Text to Speech</h1>
 </p>
 <p align="center">
@@ -20,11 +20,11 @@
 
 ## ⭐ Key Features
 - **TTS**: Text-to-Speech generation with any voice via prompt audio
-- **VC**: Voice Conversion (TODO)
+- **OpenAI-API-compatible**: Compatible with OpenAI's Text-to-Speech API format
 
 ## 🛠️ Installation
 
-VietTTS can be installed via either a Python installer or Docker.
+VietTTS can be installed via a Python installer (Linux only, with Windows and macOS support coming soon) or Docker.
 
 ### Python Installer
 ```bash
@@ -54,11 +54,8 @@ docker compose build
 # Run with docker-compose - will create server at: http://localhost:8298
 docker compose up -d
 
-# Run with docker run - will create server at: http://localhost:8298
+# Or run with docker run - will create server at: http://localhost:8298
 docker run -itd --gpus all -p 8298:8298 -v ./pretrained-models:/app/pretrained-models --name viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298
-
-# Show available voices
-docker exec viet-tts-service viettts show-voices
 ```
 
 ## 🚀 Usage
@@ -108,11 +105,14 @@ viettts --help
 # Start API Server
 viettts server --host 0.0.0.0 --port 8298
 
-# Synthesis speech from text
-viettts synthesis --text "Xin chào" --voice 0 --output test.wav
-
 # List all built-in voices
 viettts show-voices
+
+# Synthesize speech from text with built-in voices
+viettts synthesis --text "Xin chào" --voice 0 --output test.wav
+
+# Clone voice from a local audio file
+viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav
 ```
 
 ### API Client
@@ -144,14 +144,24 @@ with client.audio.speech.with_streaming_response.create(
 
 #### CURL
 ```bash
+# Get all built-in voices
+curl --location http://0.0.0.0:8298/v1/voices
+
+# OpenAI format (built-in voices)
 curl http://localhost:8298/v1/audio/speech \
-  -H "Authorization: Bearer viet-tts" \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "tts-1",
-    "input": "Xin chào Việt Nam.",
-    "voice": "son-tung-mtp"
-  }' \
+  -H "Authorization: Bearer viet-tts" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "tts-1",
+    "input": "Xin chào Việt Nam.",
+    "voice": "son-tung-mtp"
+  }' \
+  --output speech.wav
+
+# API with voice from local file
+curl --location http://0.0.0.0:8298/v1/tts \
+  --form 'text="xin chào"' \
+  --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \
   --output speech.wav
 ```
 

diff --git a/README_VN.md b/README_VN.md
@@ -1,5 +1,5 @@
 <p align="center">
-  <img src="assets/viet-tts-medium.png" style="width: 22%">
+  <img src="assets/viet-tts-medium.png" style="width: 200px">
   <h1 align="center" style="color: white; font-weight: bold; font-family:roboto"><span style="color: white; font-weight: bold; font-family:roboto">VietTTS</span>: Công cụ chuyển văn bản thành giọng nói tiếng Việt mã nguồn mở</h1>
 </p>
 <p align="center">
@@ -18,10 +18,10 @@
 
 ## ⭐ Tính năng nổi bật
 - **TTS**: Tổng hợp giọng nói từ văn bản với bất kỳ giọng nào qua audio mẫu
-- **VC**: Chuyển đổi giọng nói (TODO)
+- **OpenAI-API-compatible**: Tương thích với API Text to Speech OpenAI
 
 ## 🛠️ Cài đặt
-VietTTS có thể cài đặt qua trình cài đặt Python hoặc Docker.
+VietTTS có thể được cài đặt qua trình cài đặt Python (chỉ hỗ trợ Linux, Windows và macOS sẽ có trong tương lai) hoặc Docker.
 
 ### Trình cài đặt Python
 
@@ -53,9 +53,6 @@ docker compose up -d
 
 # Chạy bằng docker run - tạo server tại: http://localhost:8298
 docker run -itd --gpus all -p 8298:8298 -v ./pretrained-models:/app/pretrained-models --name viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298
-
-# Hiển thị danh sách giọng nói sẵn có
-docker exec viet-tts-service viettts show-voices
 ```
 
 ## 🚀 Sử dụng
@@ -109,11 +106,14 @@ viettts --help
 # Khởi động API Server
 viettts server --host 0.0.0.0 --port 8298
 
-# Tổng hợp giọng nói từ văn bản
+# Xem tất cả các giọng nói có sẵn
+viettts show-voices
+
+# Tổng hợp giọng nói từ văn bản với giọng có sẵn
 viettts synthesis --text "Xin chào" --voice 0 --output test.wav
 
-# Liệt kê tất cả các giọng nói có sẵn
-viettts show-voices
+# Sao chép giọng từ audio file bất kì
+viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav
 ```
 
 ### API Client
@@ -149,6 +149,10 @@ with client.audio.speech.with_streaming_response.create(
 
 #### CURL
 ```bash
+# Lấy danh sách giọng có sẵn
+curl --location http://0.0.0.0:8298/v1/voices
+
+# OpenAI API format
 curl http://localhost:8298/v1/audio/speech \
   -H "Authorization: Bearer viet-tts" \
   -H "Content-Type: application/json" \
@@ -158,6 +162,12 @@ curl http://localhost:8298/v1/audio/speech \
     "voice": "son-tung-mtp"
   }' \
   --output speech.wav
+
+# API với giọng từ file local
+curl --location http://0.0.0.0:8298/v1/tts \
+  --form 'text="xin chào"' \
+  --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \
+  --output speech.wav
 ```
 
 #### Node

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "viet-tts"
+name = "viettts"
 version = "0.1.0"
 description = "VietTTS: An Open-Source Vietnamese Text to Speech"
 authors = ["dangvansam <[email protected]>"]
@@ -8,17 +8,14 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.10"
 conformer = "0.3.2"
-deepspeed = "0.14.2"
 diffusers = "0.27.2"
 gradio = "4.32.2"
 hydra-core = "1.3.2"
 hyperpyyaml = "1.2.2"
 librosa = "0.10.2"
-networkx = "3.1"
 omegaconf = "2.3.0"
 onnx = "1.16.0"
 onnxruntime-gpu = "1.16.0"
-openai-whisper = "20231117"
 protobuf = "4.25"
 pydantic = "2.7.0"
 soundfile = "0.12.1"
@@ -29,15 +26,16 @@ wget = "3.2"
 fastapi = "0.111.0"
 fastapi-cli = "0.0.4"
 loguru = "0.7.2"
-natsort = "8.4.0"
 vinorm = "^2.0.7"
 huggingface-hub = "0.24.7"
 click = "^8.1.7"
 gunicorn = "^23.0.0"
 silero-vad = "^5.1.2"
+tiktoken = "^0.8.0"
+openai-whisper = "^20240930"
 
 [tool.poetry.scripts]
-viet-tts = "viettts.cli:cli"
+viettts = "viettts.cli:cli"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/viettts/cli.py b/viettts/cli.py
@@ -14,9 +14,9 @@
 MODEL_DIR = 'pretrained-models'
 
 @click.command('server')
-@click.option('-h', '--host', type=str, default='0.0.0.0')
-@click.option('-p', '--port', type=int, default=8298)
-@click.option('-w', '--workers', type=int, default=1)
+@click.option('-h', '--host', type=str, default='0.0.0.0', help="The host address to bind the server to. Default is '0.0.0.0'.")
+@click.option('-p', '--port', type=int, default=8298, help="The port number to bind the server to. Default is 8298.")
+@click.option('-w', '--workers', type=int, default=1, help="The number of worker processes to handle requests. Default is 1.")
 def start_server(host: str, port: int, workers: int):
     """Start API server (OpenAI TTS API compatible).
 
@@ -37,17 +37,14 @@ def start_server(host: str, port: int, workers: int):
 
 
 @click.command('synthesis')
-@click.option('-t', "--text", type=str, required=True)
-@click.option('-v', "--voice", type=str, default='1')
-@click.option('-s', "--speed", type=float, default=1)
-@click.option('-o', "--output", type=str, default='output.wav')
+@click.option('-t', "--text", type=str, required=True, help="The input text to synthesize into speech.")
+@click.option('-v', "--voice", type=str, default='1', help="The voice ID or file path to clone the voice from. Default is '1'.")
+@click.option('-s', "--speed", type=float, default=1, help="The speed multiplier for the speech. Default is 1 (normal speed).")
+@click.option('-o', "--output", type=str, default='output.wav', help="The file path to save the synthesized audio. Default is 'output.wav'.")
 def synthesis(text: str, voice: str, speed: float, output: str):
     """Synthesize audio from text and save to file.
 
-    Usage:
-        viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --output test_nu-nhe-nhang.wav
-        viettts synthesis --text 'Chào bạn đến với Hà Nội' --voice 8 --speed 1.2 --output test_voice_8_speed_1.2.wav
-        viettts synthesis --text 'Bạn có thể sao chép giọng sẵn có' --voice Downloads/audio.wav
+    Usage: viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --speed 1.2 --output test_nu-nhe-nhang.wav
     """
     logger.info("Starting synthesis")
     st = time.perf_counter()
@@ -107,7 +104,8 @@ def cli():
     """
     VietTTS CLI v0.1.0
     
-    Vietnamese Text To Speech and Voice Clone - License: Apache 2.0 - Author: <dangvansam [email protected]>
+    Vietnamese Text To Speech and Voice Clone
+    License: Apache 2.0 - Author: <dangvansam [email protected]>
     """
     pass
 

diff --git a/viettts/flow/flow.py b/viettts/flow/flow.py
@@ -10,23 +10,49 @@
 
 class MaskedDiffWithXvec(torch.nn.Module):
     def __init__(self,
-                 input_size: int = 512,
-                 output_size: int = 80,
-                 spk_embed_dim: int = 192,
-                 output_type: str = "mel",
-                 vocab_size: int = 4096,
-                 input_frame_rate: int = 50,
-                 only_mask_loss: bool = True,
-                 encoder: torch.nn.Module = None,
-                 length_regulator: torch.nn.Module = None,
-                 decoder: torch.nn.Module = None,
-                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
-                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
-                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
-                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
-                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
-                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
-                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+                input_size: int = 512,
+                output_size: int = 80,
+                spk_embed_dim: int = 192,
+                output_type: str = "mel",
+                vocab_size: int = 4096,
+                input_frame_rate: int = 50,
+                only_mask_loss: bool = True,
+                encoder: torch.nn.Module = None,
+                length_regulator: torch.nn.Module = None,
+                decoder: torch.nn.Module = None,
+                decoder_conf: Dict = {
+                    'in_channels': 240,
+                    'out_channel': 80,
+                    'spk_emb_dim': 80,
+                    'n_spks': 1,
+                    'cfm_params': DictConfig({
+                        'sigma_min': 1e-06,
+                        'solver': 'euler',
+                        't_scheduler': 'cosine',
+                        'training_cfg_rate': 0.2,
+                        'inference_cfg_rate': 0.7,
+                        'reg_loss_type': 'l1'
+                    }),
+                    'decoder_params': {
+                        'channels': [256, 256],
+                        'dropout': 0.0,
+                        'attention_head_dim': 64,
+                        'n_blocks': 4,
+                        'num_mid_blocks': 12,
+                        'num_heads': 8,
+                        'act_fn': 'gelu'
+                    }
+                },
+                mel_feat_conf: Dict = {
+                    'n_fft': 1024,
+                    'num_mels': 80,
+                    'sampling_rate': 22050,
+                    'hop_size': 256,
+                    'win_size': 1024,
+                    'fmin': 0,
+                    'fmax': 8000
+                }
+            ):
         super().__init__()
         self.input_size = input_size
         self.output_size = output_size