From 78951e9ffae3685cf171247326276239f443ed99 Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Mon, 2 Dec 2024 18:32:36 +0700
Subject: [PATCH 1/7] update readme, pyproject and add voice clone api

---
 README.md         |   5 +-
 README_VN.md      |   3 -
 pyproject.toml    |   8 +--
 viettts/cli.py    |   3 +-
 viettts/server.py | 144 ++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 139 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
index 601dd5f..6b107c7 100644
--- a/README.md
+++ b/README.md
@@ -54,11 +54,8 @@ docker compose build
 # Run with docker-compose - will create server at: http://localhost:8298
 docker compose up -d
 
-# Run with docker run - will create server at: http://localhost:8298
+# Or run with docker run - will create server at: http://localhost:8298
 docker run -itd --gpu=alls -p 8298:8298 -v ./pretrained-models:/app/pretrained-models -n viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298
-
-# Show available voices
-docker exec viet-tts-service viettts show-voices
 ```
 
 ## 🚀 Usage
diff --git a/README_VN.md b/README_VN.md
index c907c31..f4f9f12 100644
--- a/README_VN.md
+++ b/README_VN.md
@@ -53,9 +53,6 @@ docker compose up -d
 
 # Chạy bằng docker run - tạo server tại: http://localhost:8298
 docker run -itd --gpu=alls -p 8298:8298 -v ./pretrained-models:/app/pretrained-models -n viet-tts-service viet-tts:latest viettts server --host 0.0.0.0 --port 8298
-
-# Hiển thị danh sách giọng nói sẵn có
-docker exec viet-tts-service viettts show-voices
 ```
 
 ## 🚀 Sử dụng
diff --git a/pyproject.toml b/pyproject.toml
index 3189f0a..d4b1b78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "viet-tts"
+name = "viettts"
 version = "0.1.0"
 description = "VietTTS: An Open-Source Vietnamese Text to Speech"
 authors = ["dangvansam <dangvansam98@gmail.com>"]
@@ -8,17 +8,14 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.10"
 conformer = "0.3.2"
-deepspeed = "0.14.2"
 diffusers = "0.27.2"
 gradio = "4.32.2"
 hydra-core = "1.3.2"
 hyperpyyaml = "1.2.2"
 librosa = "0.10.2"
-networkx = "3.1"
 omegaconf = "2.3.0"
 onnx = "1.16.0"
 onnxruntime-gpu = "1.16.0"
-openai-whisper = "20231117"
 protobuf = "4.25"
 pydantic = "2.7.0"
 soundfile = "0.12.1"
@@ -29,7 +26,6 @@ wget = "3.2"
 fastapi = "0.111.0"
 fastapi-cli = "0.0.4"
 loguru = "0.7.2"
-natsort = "8.4.0"
 vinorm = "^2.0.7"
 huggingface-hub = "0.24.7"
 click = "^8.1.7"
@@ -37,7 +33,7 @@ gunicorn = "^23.0.0"
 silero-vad = "^5.1.2"
 
 [tool.poetry.scripts]
-viet-tts = "viettts.cli:cli"
+viettts = "viettts.cli:cli"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/viettts/cli.py b/viettts/cli.py
index e9dc288..957469c 100644
--- a/viettts/cli.py
+++ b/viettts/cli.py
@@ -107,7 +107,8 @@ def cli():
     """
     VietTTS CLI v0.1.0
     
-    Vietnamese Text To Speech and Voice Clone - License: Apache 2.0 - Author: <dangvansam dangvansam98@gmail.com>
+    Vietnamese Text To Speech and Voice Clone
+    License: Apache 2.0 - Author: <dangvansam dangvansam98@gmail.com>
     """
     pass
 
diff --git a/viettts/server.py b/viettts/server.py
index 0fdca90..6d470bd 100644
--- a/viettts/server.py
+++ b/viettts/server.py
@@ -1,18 +1,22 @@
 import io
 import os
 import queue
+import random
 import subprocess
 import threading
 import wave
 
+import tempfile
+import shutil
+import requests
 import numpy as np
 from loguru import logger
 from typing import Any, List, Optional
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 from anyio import CapacityLimiter
 from anyio.lowlevel import RunVar
-from fastapi import FastAPI, UploadFile, Form, File
-from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
+from fastapi import FastAPI, UploadFile, Form, File, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse, FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 
 from viettts.tts import TTS
@@ -27,8 +31,12 @@
 
 
 app = FastAPI(
-    title="VietTTS",
-    description="VietTTS API"
+    title="VietTTS API",
+    description="""
+    VietTTS API (https://github.comdangvansam/viet-tts)
+    Vietnamese Text To Speech and Voice Clone
+    License: Apache 2.0 - Author: <dangvansam dangvansam98@gmail.com>
+    """
 )
 app.add_middleware(
     CORSMiddleware,
@@ -47,13 +55,17 @@ def generate_data(model_output):
     yield audio
 
 
-class GenerateSpeechRequest(BaseModel):
+class OpenAITTSRequest(BaseModel):
     input: str
     model: str = "tts-1"
-    voice: str = list(VOICE_MAP)[1]
+    voice: str = random.choice(list(VOICE_MAP))
     response_format: str = "wav"
     speed: float = 1.0
 
+class TTSRequest(BaseModel):
+    text: str
+    voice: str = random.choice(list(VOICE_MAP))
+    speed: float = 1.0
 
 def wav_chunk_header(sample_rate=22050, bit_depth=16, channels=1):
     buffer = io.BytesIO()
@@ -69,7 +81,11 @@ def wav_chunk_header(sample_rate=22050, bit_depth=16, channels=1):
 
 @app.get("/", response_class=PlainTextResponse)
 async def root():
-    return 'VietTTS API Server Is Runing'
+    return 'VietTTS API'
+
+@app.get("/health", response_class=PlainTextResponse)
+async def health():
+    return 'VietTTS API is running...'
 
 @app.get("/voices")
 @app.get("/v1/voices")
@@ -78,8 +94,9 @@ async def show_voices():
 
 @app.post("/audio/speech")
 @app.post("/v1/audio/speech")
-async def tts(tts_request: GenerateSpeechRequest):
-    logger.info(f"Generate speech request: {tts_request.dict()}")
+async def openai_api_tts(tts_request: OpenAITTSRequest):
+    logger.info(f"Received TTS request: {tts_request.dict()}")
+    
     if tts_request.voice.isdigit():
         voice_file = list(VOICE_MAP.values())[int(tts_request.voice)]
     else:
@@ -199,6 +216,113 @@ async def cleanup():
         background=cleanup
     )
 
+@app.post("/tts")
+@app.post("/v1/tts")
+async def tts(
+    text: str = Form(...),
+    voice: str = Form("0"),
+    speed: float = Form(1.0),
+    audio_url: str = Form(None),
+    audio_file: UploadFile = File(None)
+):
+    logger.info(f"Received TTS request: text={text}, voice={voice}, speed={speed}, audio_url={audio_url}")
+    voice_file = None
+
+    # Case 1: Uploaded audio file
+    if audio_file:
+        temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        try:
+            with open(temp_audio_file.name, "wb") as temp_file:
+                shutil.copyfileobj(audio_file.file, temp_file)
+            voice_file = temp_audio_file.name
+            logger.info(f"Using uploaded audio file as voice: {voice_file}")
+        finally:
+            audio_file.file.close()
+
+    # Case 2: Audio URL
+    elif audio_url:
+        temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        try:
+            response = requests.get(audio_url, stream=True)
+            if response.status_code != 200:
+                raise HTTPException(status_code=400, detail="Failed to fetch audio from URL")
+            with open(temp_audio_file.name, "wb") as temp_file:
+                shutil.copyfileobj(response.raw, temp_file)
+            voice_file = temp_audio_file.name
+            logger.info(f"Using audio URL as voice: {voice_file}")
+        finally:
+            response.close()
+
+    # Case 3: Predefined voice
+    elif voice:
+        if voice.isdigit():
+            voice_file = list(VOICE_MAP.values())[int(voice)]
+        else:
+            voice_file = VOICE_MAP.get(voice)
+
+        if not voice_file:
+            logger.error(f"Voice {voice} not found")
+            raise HTTPException(status_code=404, detail="Voice not found")
+    
+    else:
+        voice_file = random.choice(list(VOICE_MAP.values()))
+
+    # Error if no voice file is available
+    if not voice_file or not os.path.exists(voice_file):
+        raise HTTPException(status_code=400, detail="No valid voice file provided")
+
+    # Load prompt speech
+    prompt_speech_16k = load_prompt_speech_from_file(
+        filepath=voice_file,
+        min_duration=3,
+        max_duration=5
+    )
+
+    # Temporary file for audio output
+    temp_output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+
+    try:
+        # Generate TTS output
+        model_output = tts_obj.inference_tts(
+            tts_text=text,
+            prompt_speech_16k=prompt_speech_16k,
+            speed=speed,
+            stream=False
+        )
+
+        # Combine audio chunks and pass to ffmpeg for processing
+        raw_audio = b''.join(chunk['tts_speech'].numpy().tobytes() for chunk in model_output)
+        ffmpeg_args = [
+            "ffmpeg", "-loglevel", "error", "-y", "-f", "f32le", "-ar", "24000", "-ac", "1",
+            "-i", "-", "-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k", temp_output_file.name
+        ]
+        ffmpeg_proc = subprocess.run(
+            ffmpeg_args,
+            input=raw_audio,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE
+        )
+
+        if ffmpeg_proc.returncode != 0:
+            logger.error(f"FFmpeg error: {ffmpeg_proc.stderr.decode()}")
+            raise HTTPException(status_code=500, detail="Error during audio processing")
+
+        return FileResponse(
+            path=temp_output_file.name,
+            media_type="audio/mpeg",
+            filename="output.mp3"
+        )
+
+    finally:
+        # Clean up temporary files
+        print(temp_output_file.name)
+
+        if os.path.exists(temp_output_file.name):
+            os.unlink(temp_output_file.name)
+        if audio_file or audio_url:
+            if os.path.exists(temp_audio_file.name):
+                os.unlink(temp_audio_file.name)
+
 
 @app.on_event("startup")
 async def startup():

From 6ec5f8aa3fef08d86ecb3ab0e77d51b73c111a7a Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Wed, 11 Dec 2024 14:58:59 +0700
Subject: [PATCH 2/7] update tts voice clone api and docs

---
 .gitignore                  |  3 +-
 README.md                   |  2 +-
 README_VN.md                |  2 +-
 viettts/flow/flow.py        | 60 ++++++++++++++++++++++++++-----------
 viettts/server.py           | 33 +++++++++++---------
 viettts/utils/file_utils.py | 60 +++++++++++++++++++++++++++++++++++--
 web/.gitkeep                |  0
 7 files changed, 123 insertions(+), 37 deletions(-)
 create mode 100644 web/.gitkeep

diff --git a/.gitignore b/.gitignore
index d1c4380..a4199c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,5 +53,4 @@ node_modules
 pretrained-models/*
 *_pb2_grpc.py
 *_pb2.py
-poetry.lock
-web
\ No newline at end of file
+poetry.lock
\ No newline at end of file
diff --git a/README.md b/README.md
index 6b107c7..32b0b0a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <!-- # VietTTS: An Open-Source Vietnamese Text to Speech -->
 <p align="center">
-  <img src="assets/viet-tts-medium.png" style="width: 22%">
+  <img src="assets/viet-tts-medium.png" style="width: 200px">
   <h1 align="center"style="color: white; font-weight: bold; font-family:roboto"><span style="color: white; font-weight: bold; font-family:roboto">VietTTS</span>: An Open-Source Vietnamese Text to Speech</h1>
 </p>
 <p align="center">
diff --git a/README_VN.md b/README_VN.md
index f4f9f12..6e34fb3 100644
--- a/README_VN.md
+++ b/README_VN.md
@@ -1,5 +1,5 @@
 <p align="center">
-  <img src="assets/viet-tts-medium.png" style="width: 22%">
+  <img src="assets/viet-tts-medium.png" style="width: 200px">
   <h1 align="center" style="color: white; font-weight: bold; font-family:roboto"><span style="color: white; font-weight: bold; font-family:roboto">VietTTS</span>: Công cụ chuyển văn bản thành giọng nói tiếng Việt mã nguồn mở</h1>
 </p>
 <p align="center">
diff --git a/viettts/flow/flow.py b/viettts/flow/flow.py
index 9408037..44375a2 100644
--- a/viettts/flow/flow.py
+++ b/viettts/flow/flow.py
@@ -10,23 +10,49 @@
 
 class MaskedDiffWithXvec(torch.nn.Module):
     def __init__(self,
-                 input_size: int = 512,
-                 output_size: int = 80,
-                 spk_embed_dim: int = 192,
-                 output_type: str = "mel",
-                 vocab_size: int = 4096,
-                 input_frame_rate: int = 50,
-                 only_mask_loss: bool = True,
-                 encoder: torch.nn.Module = None,
-                 length_regulator: torch.nn.Module = None,
-                 decoder: torch.nn.Module = None,
-                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
-                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
-                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
-                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
-                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
-                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
-                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+                input_size: int = 512,
+                output_size: int = 80,
+                spk_embed_dim: int = 192,
+                output_type: str = "mel",
+                vocab_size: int = 4096,
+                input_frame_rate: int = 50,
+                only_mask_loss: bool = True,
+                encoder: torch.nn.Module = None,
+                length_regulator: torch.nn.Module = None,
+                decoder: torch.nn.Module = None,
+                decoder_conf: Dict = {
+                    'in_channels': 240,
+                    'out_channel': 80,
+                    'spk_emb_dim': 80,
+                    'n_spks': 1,
+                    'cfm_params': DictConfig({
+                        'sigma_min': 1e-06,
+                        'solver': 'euler',
+                        't_scheduler': 'cosine',
+                        'training_cfg_rate': 0.2,
+                        'inference_cfg_rate': 0.7,
+                        'reg_loss_type': 'l1'
+                    }),
+                    'decoder_params': {
+                        'channels': [256, 256],
+                        'dropout': 0.0,
+                        'attention_head_dim': 64,
+                        'n_blocks': 4,
+                        'num_mid_blocks': 12,
+                        'num_heads': 8,
+                        'act_fn': 'gelu'
+                    }
+                },
+                mel_feat_conf: Dict = {
+                    'n_fft': 1024,
+                    'num_mels': 80,
+                    'sampling_rate': 22050,
+                    'hop_size': 256,
+                    'win_size': 1024,
+                    'fmin': 0,
+                    'fmax': 8000
+                }
+            ):
         super().__init__()
         self.input_size = input_size
         self.output_size = output_size
diff --git a/viettts/server.py b/viettts/server.py
index 6d470bd..6902b24 100644
--- a/viettts/server.py
+++ b/viettts/server.py
@@ -11,6 +11,7 @@
 import requests
 import numpy as np
 from loguru import logger
+from datetime import datetime
 from typing import Any, List, Optional
 from pydantic import BaseModel
 from anyio import CapacityLimiter
@@ -33,7 +34,7 @@
 app = FastAPI(
     title="VietTTS API",
     description="""
-    VietTTS API (https://github.comdangvansam/viet-tts)
+    VietTTS API (https://github.com/dangvansam/viet-tts)
     Vietnamese Text To Speech and Voice Clone
     License: Apache 2.0 - Author: <dangvansam dangvansam98@gmail.com>
     """
@@ -230,7 +231,10 @@ async def tts(
 
     # Case 1: Uploaded audio file
     if audio_file:
-        temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        temp_audio_file = tempfile.NamedTemporaryFile(
+            delete=False,
+            suffix=f'.{audio_file.filename.split(".")[-1]}'
+        )
         try:
             with open(temp_audio_file.name, "wb") as temp_file:
                 shutil.copyfileobj(audio_file.file, temp_file)
@@ -241,7 +245,10 @@ async def tts(
 
     # Case 2: Audio URL
     elif audio_url:
-        temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        temp_audio_file = tempfile.NamedTemporaryFile(
+            delete=False,
+            suffix=f'.{audio_url.lower().split(".")[-1]}'
+        )
         try:
             response = requests.get(audio_url, stream=True)
             if response.status_code != 200:
@@ -271,18 +278,18 @@ async def tts(
     if not voice_file or not os.path.exists(voice_file):
         raise HTTPException(status_code=400, detail="No valid voice file provided")
 
-    # Load prompt speech
     prompt_speech_16k = load_prompt_speech_from_file(
         filepath=voice_file,
         min_duration=3,
         max_duration=5
     )
 
-    # Temporary file for audio output
-    temp_output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    temp_output_file = tempfile.NamedTemporaryFile(
+        delete=False, 
+        suffix=f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
+    )
 
     try:
-        # Generate TTS output
         model_output = tts_obj.inference_tts(
             tts_text=text,
             prompt_speech_16k=prompt_speech_16k,
@@ -290,7 +297,6 @@ async def tts(
             stream=False
         )
 
-        # Combine audio chunks and pass to ffmpeg for processing
         raw_audio = b''.join(chunk['tts_speech'].numpy().tobytes() for chunk in model_output)
         ffmpeg_args = [
             "ffmpeg", "-loglevel", "error", "-y", "-f", "f32le", "-ar", "24000", "-ac", "1",
@@ -307,18 +313,17 @@ async def tts(
             logger.error(f"FFmpeg error: {ffmpeg_proc.stderr.decode()}")
             raise HTTPException(status_code=500, detail="Error during audio processing")
 
+        if not os.path.exists(temp_output_file.name):
+            logger.error(f"FFmpeg did not create the output file: {temp_output_file.name}")
+            raise HTTPException(status_code=500, detail="FFmpeg failed to produce the output file")
+
         return FileResponse(
             path=temp_output_file.name,
             media_type="audio/mpeg",
-            filename="output.mp3"
+            filename=temp_output_file.name.split("/")[-1]
         )
 
     finally:
-        # Clean up temporary files
-        print(temp_output_file.name)
-
-        if os.path.exists(temp_output_file.name):
-            os.unlink(temp_output_file.name)
         if audio_file or audio_url:
             if os.path.exists(temp_audio_file.name):
                 os.unlink(temp_audio_file.name)
diff --git a/viettts/utils/file_utils.py b/viettts/utils/file_utils.py
index b1d91f9..66d2a27 100644
--- a/viettts/utils/file_utils.py
+++ b/viettts/utils/file_utils.py
@@ -1,19 +1,75 @@
 import os
+import subprocess
 import torchaudio
 import soundfile
 import numpy as np
 from glob import glob
+from loguru import logger
 from huggingface_hub import snapshot_download
 
 from viettts.utils.vad import get_speech
 
+import torchaudio
+import os
+import subprocess
+import tempfile
+
+
+def convert_to_wav(input_filepath: str, target_sr: int) -> str:
+    """
+    Convert an input audio file to WAV format with the desired sample rate using FFmpeg.
+
+    Args:
+        input_filepath (str): Path to the input audio file.
+        target_sr (int): Target sample rate.
+
+    Returns:
+        str: Path to the converted WAV file.
+    """
+    temp_wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    temp_wav_filepath = temp_wav_file.name
+    temp_wav_file.close()
+
+    ffmpeg_command = [
+        "ffmpeg", "-y",
+        "-loglevel", "error",
+        "-i", input_filepath,
+        "-ar", str(target_sr),
+        "-ac", "1",
+        temp_wav_filepath
+    ]
+
+    result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if result.returncode != 0:
+        os.unlink(temp_wav_filepath)
+        raise RuntimeError(f"FFmpeg conversion failed: {result.stderr.decode()}")
+
+    return temp_wav_filepath
+
 
 def load_wav(filepath: str, target_sr: int):
+    """
+    Load an audio file in any supported format, convert it to WAV, and load as a tensor.
+
+    Args:
+        filepath (str): Path to the audio file in any format.
+        target_sr (int): Target sample rate.
+
+    Returns:
+        Tensor: Loaded audio tensor resampled to the target sample rate.
+    """
+    # Check if the file is already in WAV format
+    if not filepath.lower().endswith(".wav"):
+        logger.info(f"Converting {filepath} to WAV format")
+        filepath = convert_to_wav(filepath, target_sr)
+
+    # Load the WAV file
     speech, sample_rate = torchaudio.load(filepath)
-    speech = speech.mean(dim=0, keepdim=True)
+    speech = speech.mean(dim=0, keepdim=True)  # Convert to mono if not already
     if sample_rate != target_sr:
-        assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
+        assert sample_rate > target_sr, f'WAV sample rate {sample_rate} must be greater than {target_sr}'
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
+
     return speech
 
 
diff --git a/web/.gitkeep b/web/.gitkeep
new file mode 100644
index 0000000..e69de29

From 3e9fd39b33808d32c0f3d0632a3a720e485d4615 Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Wed, 11 Dec 2024 16:25:49 +0700
Subject: [PATCH 3/7] update docs

---
 README.md      | 11 +++++++----
 README_VN.md   | 11 +++++++----
 viettts/cli.py | 19 ++++++++-----------
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 32b0b0a..bf5fa91 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
 
 ## 🛠️ Installation
 
-VietTTS can be installed via either a Python installer or Docker.
+VietTTS can be installed via a Python installer (Linux only, with Windows and macOS support coming soon) or Docker.
 
 ### Python Installer
 ```bash
@@ -105,11 +105,14 @@ viettts --help
 # Start API Server
 viettts server --host 0.0.0.0 --port 8298
 
-# Synthesis speech from text
-viettts synthesis --text "Xin chào" --voice 0 --output test.wav
-
 # List all built-in voices
 viettts show-voices
+
+# Synthesize speech from text with built-in voices
+viettts synthesis --text "Xin chào" --voice 0 --output test.wav
+
+# Clone voice from a local audio file
+viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav
 ```
 
 ### API Client
diff --git a/README_VN.md b/README_VN.md
index 6e34fb3..5c2f813 100644
--- a/README_VN.md
+++ b/README_VN.md
@@ -21,7 +21,7 @@
 - **VC**: Chuyển đổi giọng nói (TODO)
 
 ## 🛠️ Cài đặt
-VietTTS có thể cài đặt qua trình cài đặt Python hoặc Docker.
+VietTTS có thể được cài đặt qua trình cài đặt Python (chỉ hỗ trợ Linux, Windows và macOS sẽ có trong tương lai) hoặc Docker.
 
 ### Trình cài đặt Python
 
@@ -106,11 +106,14 @@ viettts --help
 # Khởi động API Server
 viettts server --host 0.0.0.0 --port 8298
 
-# Tổng hợp giọng nói từ văn bản
+# Xem tất cả các giọng nói có sẵn
+viettts show-voices
+
+# Tổng hợp giọng nói từ văn bản với giọng có sẵn
 viettts synthesis --text "Xin chào" --voice 0 --output test.wav
 
-# Liệt kê tất cả các giọng nói có sẵn
-viettts show-voices
+# Sao chép giọng từ audio file bất kì
+viettts synthesis --text "Xin chào" --voice Download/voice.wav --output cloned.wav
 ```
 
 ### API Client
diff --git a/viettts/cli.py b/viettts/cli.py
index 957469c..7262d0a 100644
--- a/viettts/cli.py
+++ b/viettts/cli.py
@@ -14,9 +14,9 @@
 MODEL_DIR = 'pretrained-models'
 
 @click.command('server')
-@click.option('-h', '--host', type=str, default='0.0.0.0')
-@click.option('-p', '--port', type=int, default=8298)
-@click.option('-w', '--workers', type=int, default=1)
+@click.option('-h', '--host', type=str, default='0.0.0.0', help="The host address to bind the server to. Default is '0.0.0.0'.")
+@click.option('-p', '--port', type=int, default=8298, help="The port number to bind the server to. Default is 8298.")
+@click.option('-w', '--workers', type=int, default=1, help="The number of worker processes to handle requests. Default is 1.")
 def start_server(host: str, port: int, workers: int):
     """Start API server (OpenAI TTS API compatible).
 
@@ -37,17 +37,14 @@ def start_server(host: str, port: int, workers: int):
 
 
 @click.command('synthesis')
-@click.option('-t', "--text", type=str, required=True)
-@click.option('-v', "--voice", type=str, default='1')
-@click.option('-s', "--speed", type=float, default=1)
-@click.option('-o', "--output", type=str, default='output.wav')
+@click.option('-t', "--text", type=str, required=True, help="The input text to synthesize into speech.")
+@click.option('-v', "--voice", type=str, default='1', help="The voice ID or file path to clone the voice from. Default is '1'.")
+@click.option('-s', "--speed", type=float, default=1, help="The speed multiplier for the speech. Default is 1 (normal speed).")
+@click.option('-o', "--output", type=str, default='output.wav', help="The file path to save the synthesized audio. Default is 'output.wav'.")
 def synthesis(text: str, voice: str, speed: float, output: str):
     """Synthesis audio from text and save to file.
 
-    Usage:
-        viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --output test_nu-nhe-nhang.wav
-        viettts synthesis --text 'Chào bạn đến với Hà Nội' --voice 8 --speed 1.2 --output test_voice_8_speed_1.2.wav
-        viettts synthesis --text 'Bạn có thể sao chép giọng sẵn có' --voice Downloads/audio.wav
+    Usage: viettts synthesis --text 'Xin chào VietTTS' --voice nu-nhe-nhang --speed 1.2 --output test_nu-nhe-nhang.wav
     """
     logger.info("Starting synthesis")
     st = time.perf_counter()

From 38fe8cc1a8ca1e96301bfa48df4d064b7c841900 Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Wed, 11 Dec 2024 16:36:56 +0700
Subject: [PATCH 4/7] update docs

---
 README.md    | 24 +++++++++++++++++-------
 README_VN.md | 10 ++++++++++
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index bf5fa91..867aa16 100644
--- a/README.md
+++ b/README.md
@@ -144,14 +144,24 @@ with client.audio.speech.with_streaming_response.create(
 
 #### CURL
 ```bash
+# Get all built-in voices
+curl --location http://0.0.0.0:8298/v1/voices
+
+# OpenAI format (built-in voices)
 curl http://localhost:8298/v1/audio/speech \
-  -H "Authorization: Bearer viet-tts" \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "tts-1",
-    "input": "Xin chào Việt Nam.",
-    "voice": "son-tung-mtp"
-  }' \
+  -H "Authorization: Bearer viet-tts" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "tts-1",
+    "input": "Xin chào Việt Nam.",
+    "voice": "son-tung-mtp"
+  }' \
+  --output speech.wav
+
+# API with voice from local file
+curl --location http://0.0.0.0:8298/v1/tts \
+  --form 'text="xin chào"' \
+  --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \
   --output speech.wav
 ```
 
diff --git a/README_VN.md b/README_VN.md
index 5c2f813..f012987 100644
--- a/README_VN.md
+++ b/README_VN.md
@@ -149,6 +149,10 @@ with client.audio.speech.with_streaming_response.create(
 
 #### CURL
 ```bash
+# Lấy danh sách giọng có sẵn
+curl --location http://0.0.0.0:8298/v1/voices
+
+# OpenAI API format
 curl http://localhost:8298/v1/audio/speech \
   -H "Authorization: Bearer viet-tts" \
   -H "Content-Type: application/json" \
@@ -158,6 +162,12 @@ curl http://localhost:8298/v1/audio/speech \
     "voice": "son-tung-mtp"
   }' \
   --output speech.wav
+
+# API với giọng từ file local
+curl --location http://0.0.0.0:8298/v1/tts \
+  --form 'text="xin chào"' \
+  --form 'audio_file=@"/home/viettts/Downloads/voice.mp4"' \
+  --output speech.wav
 ```
 
 #### Node

From 9cac2b2ee31bde500f383e9d389ed0e5cc072ac9 Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Wed, 11 Dec 2024 16:51:46 +0700
Subject: [PATCH 5/7] update requirements

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index d4b1b78..fd4751a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ huggingface-hub = "0.24.7"
 click = "^8.1.7"
 gunicorn = "^23.0.0"
 silero-vad = "^5.1.2"
+whisper = "^1.1.10"
 
 [tool.poetry.scripts]
 viettts = "viettts.cli:cli"

From 90e8fe136935d91b75912f30694a99689bf2861d Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Wed, 11 Dec 2024 16:54:41 +0700
Subject: [PATCH 6/7] add tiktoken

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index fd4751a..43235dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ click = "^8.1.7"
 gunicorn = "^23.0.0"
 silero-vad = "^5.1.2"
 whisper = "^1.1.10"
+tiktoken = "^0.8.0"
 
 [tool.poetry.scripts]
 viettts = "viettts.cli:cli"

From 8932564b1b2bafabb379f45fe02840d29273e17d Mon Sep 17 00:00:00 2001
From: Andrew <andrew@techainer.com>
Date: Wed, 11 Dec 2024 17:02:34 +0700
Subject: [PATCH 7/7] add whisper

---
 README.md      | 2 +-
 README_VN.md   | 2 +-
 pyproject.toml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 867aa16..746b71e 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
 
 ## ⭐ Key Features
 - **TTS**: Text-to-Speech generation with any voice via prompt audio
-- **VC**: Voice Conversion (TODO)
+- **OpenAI-API-compatible**: Compatible with OpenAI's Text-to-Speech API format
 
 ## 🛠️ Installation
 
diff --git a/README_VN.md b/README_VN.md
index f012987..1cbdf26 100644
--- a/README_VN.md
+++ b/README_VN.md
@@ -18,7 +18,7 @@
 
 ## ⭐ Tính năng nổi bật
 - **TTS**: Tổng hợp giọng nói từ văn bản với bất kỳ giọng nào qua audio mẫu
-- **VC**: Chuyển đổi giọng nói (TODO)
+- **OpenAI-API-compatible**: Tương thích với API Text to Speech OpenAI
 
 ## 🛠️ Cài đặt
 VietTTS có thể được cài đặt qua trình cài đặt Python (chỉ hỗ trợ Linux, Windows và macOS sẽ có trong tương lai) hoặc Docker.
diff --git a/pyproject.toml b/pyproject.toml
index 43235dd..f850dcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,8 +31,8 @@ huggingface-hub = "0.24.7"
 click = "^8.1.7"
 gunicorn = "^23.0.0"
 silero-vad = "^5.1.2"
-whisper = "^1.1.10"
 tiktoken = "^0.8.0"
+openai-whisper = "^20240930"
 
 [tool.poetry.scripts]
 viettts = "viettts.cli:cli"