From 13ad518ccac22ed4459a94195ac87c568ea263a9 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 6 Dec 2023 17:43:10 +0100 Subject: [PATCH 1/9] Inference_only --- demo.py | 27 +++++++++++++++++++++++++++ server/main.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 demo.py diff --git a/demo.py b/demo.py new file mode 100644 index 0000000..eccd871 --- /dev/null +++ b/demo.py @@ -0,0 +1,27 @@ +import gradio as gr +import requests + +def clone_speaker(audio, speaker_name, state_vars): + embeddings = requests.post( + "http://localhost:8000/clone_speaker", + json={"wav_file": audio} + ).json() + state_vars[speaker_name] = embeddings + return state_vars + +def tts(text, speaker_name, state_vars): + embeddings = state_vars[speaker_name] + generated_audio = requests.post( + "http://localhost:8000/tts", + json={ + "text": text, + "language": "en", + "speaker_embedding": embeddings["speaker_embedding"], + "gpt_cond_latent": embeddings["gpt_cond_latent"] + } + ).content + return generated_audio + + + + diff --git a/server/main.py b/server/main.py index 0e83cd9..b4c9d3d 100644 --- a/server/main.py +++ b/server/main.py @@ -164,3 +164,48 @@ def predict_streaming_endpoint(parsed_input: StreamingInputs): predict_streaming_generator(parsed_input), media_type="audio/wav", ) + +class TTSInputs(BaseModel): + speaker_embedding: List[float] + gpt_cond_latent: List[List[float]] + text: str + language: Literal[ + "en", + "de", + "fr", + "es", + "it", + "pl", + "pt", + "tr", + "ru", + "nl", + "cs", + "ar", + "zh", + "ja", + "hu", + "ko", + ] + +@app.post("/tts") +def predict_speech(parsed_input: TTSInputs): + speaker_embedding = ( + torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1) + ).cuda() + gpt_cond_latent = ( + torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0) + ).cuda() + text = parsed_input.text + language = parsed_input.language + + out = model.inference( + text, + language, + gpt_cond_latent, + speaker_embedding, + ) + + wav = postprocess(torch.tensor(out["wav"])) + + return encode_audio_common(wav.tobytes()) From 998fe19260744f41d76b0556338ca1e2031bfb04 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Fri, 8 Dec 2023 11:46:47 +0100 Subject: [PATCH 2/9] Add new endpoints --- server/main.py | 74 ++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/server/main.py b/server/main.py index b4c9d3d..6a91204 100644 --- a/server/main.py +++ b/server/main.py @@ -31,20 +31,26 @@ else: print("Loading default model", flush=True) model_name = "tts_models/multilingual/multi-dataset/xtts_v2" - print("Downloading XTTS Model:",model_name, flush=True) + print("Downloading XTTS Model:", model_name, flush=True) ModelManager().download_model(model_name) model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--")) - print("XTTS Model downloaded",flush=True) + print("XTTS Model downloaded", flush=True) -print("Loading XTTS",flush=True) + # Temporary fix, to wait for the TTS update + import requests + speakers = requests.get("https://huggingface.co/coqui/XTTS-v2/resolve/main/speakers.pth") + with open(os.path.join(model_path, "speakers.pth"), "wb") as fp: + fp.write(speakers.content) + +print("Loading XTTS", flush=True) config = XttsConfig() config.load_json(os.path.join(model_path, "config.json")) model = Xtts.init_from_config(config) model.load_checkpoint(config, checkpoint_dir=model_path, eval=True, use_deepspeed=True) model.to(device) 
-print("XTTS Loaded.",flush=True) +print("XTTS Loaded.", flush=True) -print("Running XTTS Server ...",flush=True) +print("Running XTTS Server ...", flush=True) ##### Run fastapi ##### app = FastAPI( @@ -104,24 +110,7 @@ class StreamingInputs(BaseModel): speaker_embedding: List[float] gpt_cond_latent: List[List[float]] text: str - language: Literal[ - "en", - "de", - "fr", - "es", - "it", - "pl", - "pt", - "tr", - "ru", - "nl", - "cs", - "ar", - "zh", - "ja", - "hu", - "ko", - ] + language: str add_wav_header: bool = True stream_chunk_size: str = "20" @@ -169,24 +158,7 @@ class TTSInputs(BaseModel): speaker_embedding: List[float] gpt_cond_latent: List[List[float]] text: str - language: Literal[ - "en", - "de", - "fr", - "es", - "it", - "pl", - "pt", - "tr", - "ru", - "nl", - "cs", - "ar", - "zh", - "ja", - "hu", - "ko", - ] + language: str @app.post("/tts") def predict_speech(parsed_input: TTSInputs): @@ -209,3 +181,23 @@ def predict_speech(parsed_input: TTSInputs): wav = postprocess(torch.tensor(out["wav"])) return encode_audio_common(wav.tobytes()) + + +@app.get("/studio_speakers") +def get_speakers(): + speaker_file = os.path.join(model_path, "speakers.pth") + if os.path.isfile(speaker_file): + speakers = torch.load(speaker_file) + return { + speaker: { + "speaker_embedding": speakers[speaker]["speaker_embedding"].cpu().squeeze().half().tolist(), + "gpt_cond_latent": speakers[speaker]["gpt_cond_latent"].cpu().squeeze().half().tolist(), + } + for speaker in speakers.keys() + } + else: + return {} + +@app.get("/languages") +def get_speakers(): + return config.languages \ No newline at end of file From 02ab81c291e84caafda15f0edce9fff5323d22ef Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Fri, 8 Dec 2023 11:46:58 +0100 Subject: [PATCH 3/9] Add demo --- .gitignore | 1 + demo.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 8 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..181fd98 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +demo_outputs \ No newline at end of file diff --git a/demo.py b/demo.py index eccd871..33cfcea 100644 --- a/demo.py +++ b/demo.py @@ -1,18 +1,31 @@ import gradio as gr import requests +import base64 +import tempfile +import os -def clone_speaker(audio, speaker_name, state_vars): +SERVER_URL = 'http://localhost:8000' +try: + STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json() +except: + raise Exception("Please make sure the server is running first.") + +cloned_speakers = {} + +def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names): + files = {"wav_file": ("reference.wav", open(upload_file, "rb"))} embeddings = requests.post( - "http://localhost:8000/clone_speaker", - json={"wav_file": audio} + SERVER_URL + "/clone_speaker", + files=files, ).json() - state_vars[speaker_name] = embeddings - return state_vars + cloned_speakers[clone_speaker_name] = embeddings + cloned_speaker_names.append(clone_speaker_name) + return cloned_speaker_names, gr.Dropdown.update(choices=cloned_speaker_names) -def tts(text, speaker_name, state_vars): - embeddings = state_vars[speaker_name] +def tts(text, speaker_type, speaker_name_studio, speaker_name_custom): + embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom] generated_audio = requests.post( - "http://localhost:8000/tts", + SERVER_URL + "/tts", json={ "text": text, "language": "en", @@ -20,8 +33,49 @@ def tts(text, 
speaker_name, state_vars): "gpt_cond_latent": embeddings["gpt_cond_latent"] } ).content + generated_audio = base64.b64decode(generated_audio) + if not os.path.exists("demo_outputs"): + os.mkdir("demo_outputs") + generated_audio_path = os.path.join("demo_outputs", next(tempfile._get_candidate_names()) + ".wav") + with open(generated_audio_path, "wb") as fp: + fp.write(generated_audio) + generated_audio = fp.name return generated_audio +with gr.Blocks() as demo: + cloned_speaker_names = gr.State([]) + with gr.Tab("TTS"): + with gr.Row() as col4: + speaker_name_studio = gr.Dropdown(label="Studio speaker", choices=STUDIO_SPEAKERS.keys()) + speaker_name_custom = gr.Dropdown(label="Cloned speaker", choices=cloned_speaker_names.value) + with gr.Column() as col2: + speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio") + text = gr.Textbox(label="text") + tts_button = gr.Button(value="TTS") + with gr.Column() as col3: + generated_audio = gr.Audio(label="Generated audio", autoplay=True) + with gr.Tab("Clone a new speaker"): + with gr.Column() as col1: + upload_file = gr.Audio(label="Upload reference audio", type="filepath") + clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker") + clone_button = gr.Button(value="Clone speaker") + clone_button.click( + fn=clone_speaker, + inputs=[upload_file, clone_speaker_name, cloned_speaker_names], + outputs=[cloned_speaker_names, speaker_name_custom], + ) + tts_button.click( + fn=tts, + inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom], + outputs=[generated_audio], + ) +if __name__ == "__main__": + demo.launch( + share=True, + debug=True, + server_port=3009, + server_name="0.0.0.0", + ) From d18d7fb5817bd0cbef564fd009de7dfa011fa53a Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Fri, 8 Dec 2023 14:03:43 +0100 Subject: [PATCH 4/9] Fix endpoint name --- server/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/main.py b/server/main.py index 6a91204..507a9dc 100644 --- a/server/main.py +++ b/server/main.py @@ -199,5 +199,5 @@ def get_speakers(): return {} @app.get("/languages") -def get_speakers(): +def get_languages(): return config.languages \ No newline at end of file From 07bda4e6dfdac710ddc2806965291359931e81f2 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Fri, 8 Dec 2023 15:30:07 +0100 Subject: [PATCH 5/9] Update gradio demo --- demo.py | 74 +++++++++++++++++++++++++++++-------------- test/requirements.txt | 1 + 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/demo.py b/demo.py index 33cfcea..fa4a6c6 100644 --- a/demo.py +++ b/demo.py @@ -2,55 +2,81 @@ import requests import base64 import tempfile +import json import os + SERVER_URL = 'http://localhost:8000' +OUTPUT = "./demo_outputs" +cloned_speakers = {} + +print("Preparing file structure...") +if not os.path.exists(OUTPUT): + os.mkdir(OUTPUT) + os.mkdir(os.path.join(OUTPUT, "cloned_speakers")) + os.mkdir(os.path.join(OUTPUT, "generated_audios")) +elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")): + print("Loading existing cloned speakers...") + for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")): + if file.endswith(".json"): + with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp: + cloned_speakers[file[:-5]] = json.load(fp) + print("Available cloned speakers:", ", ".join(cloned_speakers.keys())) + try: + print("Getting metadata from server ...") STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json() + print("Available 
studio speakers:", ", ".join(STUDIO_SPEAKERS.keys())) + LANUGAGES = requests.get(SERVER_URL + "/languages").json() + print("Available languages:", ", ".join(LANUGAGES)) except: raise Exception("Please make sure the server is running first.") -cloned_speakers = {} def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names): files = {"wav_file": ("reference.wav", open(upload_file, "rb"))} - embeddings = requests.post( - SERVER_URL + "/clone_speaker", - files=files, - ).json() + embeddings = requests.post(SERVER_URL + "/clone_speaker", files=files).json() + with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp: + json.dump(embeddings, fp) cloned_speakers[clone_speaker_name] = embeddings cloned_speaker_names.append(clone_speaker_name) - return cloned_speaker_names, gr.Dropdown.update(choices=cloned_speaker_names) + return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown.update(choices=cloned_speaker_names) -def tts(text, speaker_type, speaker_name_studio, speaker_name_custom): +def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang): embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom] generated_audio = requests.post( SERVER_URL + "/tts", json={ "text": text, - "language": "en", + "language": lang, "speaker_embedding": embeddings["speaker_embedding"], "gpt_cond_latent": embeddings["gpt_cond_latent"] } ).content - generated_audio = base64.b64decode(generated_audio) - if not os.path.exists("demo_outputs"): - os.mkdir("demo_outputs") - generated_audio_path = os.path.join("demo_outputs", next(tempfile._get_candidate_names()) + ".wav") + generated_audio_path = os.path.join("demo_outputs", "generated_audios", next(tempfile._get_candidate_names()) + ".wav") with open(generated_audio_path, "wb") as fp: - fp.write(generated_audio) - generated_audio = fp.name - return generated_audio + fp.write(base64.b64decode(generated_audio)) + return fp.name with gr.Blocks() as demo: - cloned_speaker_names = gr.State([]) + cloned_speaker_names = gr.State(list(cloned_speakers.keys())) with gr.Tab("TTS"): - with gr.Row() as col4: - speaker_name_studio = gr.Dropdown(label="Studio speaker", choices=STUDIO_SPEAKERS.keys()) - speaker_name_custom = gr.Dropdown(label="Cloned speaker", choices=cloned_speaker_names.value) - with gr.Column() as col2: + with gr.Column() as row4: + with gr.Row() as col4: + speaker_name_studio = gr.Dropdown( + label="Studio speaker", + choices=STUDIO_SPEAKERS.keys(), + value="Asya Anara" if "Asya Anara" in STUDIO_SPEAKERS.keys() else None, + ) + speaker_name_custom = gr.Dropdown( + label="Cloned speaker", + choices=cloned_speaker_names.value, + value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None, + ) speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio") - text = gr.Textbox(label="text") + with gr.Column() as col2: + lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="en") + text = gr.Textbox(label="text", value="A quick brown fox jumps over the lazy dog.") tts_button = gr.Button(value="TTS") with gr.Column() as col3: generated_audio = gr.Audio(label="Generated audio", autoplay=True) @@ -63,18 +89,18 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom): clone_button.click( fn=clone_speaker, inputs=[upload_file, clone_speaker_name, cloned_speaker_names], - outputs=[cloned_speaker_names, speaker_name_custom], + outputs=[upload_file, 
clone_speaker_name, cloned_speaker_names, speaker_name_custom], ) tts_button.click( fn=tts, - inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom], + inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang], outputs=[generated_audio], ) if __name__ == "__main__": demo.launch( - share=True, + share=False, debug=True, server_port=3009, server_name="0.0.0.0", diff --git a/test/requirements.txt b/test/requirements.txt index 2c24336..7a68a27 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1 +1,2 @@ requests==2.31.0 +gradio==3.50.2 From c2f15eca11c0e6ddfdfd7091a1782506f25a87ce Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 11:51:58 +0100 Subject: [PATCH 6/9] Getting language feedback first --- demo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demo.py b/demo.py index fa4a6c6..1a56343 100644 --- a/demo.py +++ b/demo.py @@ -25,10 +25,10 @@ try: print("Getting metadata from server ...") - STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json() - print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys())) LANUGAGES = requests.get(SERVER_URL + "/languages").json() print("Available languages:", ", ".join(LANUGAGES)) + STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json() + print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys())) except: raise Exception("Please make sure the server is running first.") @@ -101,7 +101,7 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang): if __name__ == "__main__": demo.launch( share=False, - debug=True, + debug=False, server_port=3009, server_name="0.0.0.0", ) From 24b0f6d934cfadd4cc853c0b090ab90b2e136cf6 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 14:20:03 +0100 Subject: [PATCH 7/9] Update readme --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 66e985b..a22192e 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,16 @@ CUDA 12.1 version (for newer cards) $ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121 ``` -If you have already downloaded v2 model and like to use this server, and using Ubuntu, change your /home/YOUR_USER_NAME +Run with a custom XTTS v2 model (FT or previous versions): ```bash -$ docker run -v /home/YOUR_USER_NAME/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2:/root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2 --env NVIDIA_DISABLE_REQUIRE=1 --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest` +$ docker run -v /path/to/model/folder:/app/tts_models --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest` ``` + Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to the terms of the [CPML license](https://coqui.ai/cpml). +(Fine-tuned XTTS models also are under the [CPML license](https://coqui.ai/cpml)) + ## Testing the server 1. Generate audio with the test script: @@ -52,15 +55,9 @@ $ docker run --gpus all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 xtts-stream Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to the terms of the [CPML license](https://coqui.ai/cpml). -2. 
(bis) Run the server container with your own model: - -```bash -docker run -v /path/to/model/folder:/app/tts_models --gpus all --rm -p 8000:80 xtts-stream -``` Make sure the model folder contains the following files: - `config.json` - `model.pth` - `vocab.json` -(Fine-tuned XTTS models also are under the [CPML license](https://coqui.ai/cpml)) \ No newline at end of file From c282c36f4f1c1a69402782b5d87684040938256e Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 12 Dec 2023 18:28:40 +0100 Subject: [PATCH 8/9] Update to latest TTS --- server/main.py | 16 ++++------------ server/requirements.txt | 2 +- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/server/main.py b/server/main.py index 507a9dc..166b4d6 100644 --- a/server/main.py +++ b/server/main.py @@ -36,12 +36,6 @@ model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--")) print("XTTS Model downloaded", flush=True) - # Temporary fix, to wait for the TTS update - import requests - speakers = requests.get("https://huggingface.co/coqui/XTTS-v2/resolve/main/speakers.pth") - with open(os.path.join(model_path, "speakers.pth"), "wb") as fp: - fp.write(speakers.content) - print("Loading XTTS", flush=True) config = XttsConfig() config.load_json(os.path.join(model_path, "config.json")) @@ -185,15 +179,13 @@ def predict_speech(parsed_input: TTSInputs): @app.get("/studio_speakers") def get_speakers(): - speaker_file = os.path.join(model_path, "speakers.pth") - if os.path.isfile(speaker_file): - speakers = torch.load(speaker_file) + if hasattr(model, "speaker_manager") and hasattr(model.speaker_manager, "speakers"): return { speaker: { - "speaker_embedding": speakers[speaker]["speaker_embedding"].cpu().squeeze().half().tolist(), - "gpt_cond_latent": speakers[speaker]["gpt_cond_latent"].cpu().squeeze().half().tolist(), + "speaker_embedding": model.speaker_manager.speakers[speaker]["speaker_embedding"].cpu().squeeze().half().tolist(), + "gpt_cond_latent": model.speaker_manager.speakers[speaker]["gpt_cond_latent"].cpu().squeeze().half().tolist(), } - for speaker in speakers.keys() + for speaker in model.speaker_manager.speakers.keys() } else: return {} diff --git a/server/requirements.txt b/server/requirements.txt index feff0a8..5d1c959 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -1,4 +1,4 @@ -TTS @ git+https://github.com/coqui-ai/TTS@00a870c26abdc06429ffef3e2814b1a1d5b40fff +TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62 uvicorn[standard]==0.23.2 fastapi==0.95.2 deepspeed==0.10.3 From 980ea5704cf1003cb9a1b53088cd9462c2569927 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 12 Dec 2023 18:50:41 +0100 Subject: [PATCH 9/9] Update docs --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a22192e..f4805cf 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,14 @@ the terms of the [CPML license](https://coqui.ai/cpml). ## Testing the server -1. Generate audio with the test script: +### Using the gradio demo + +```bash +$ python -m pip install -r test/requirements.txt +$ python demo.py +``` + +### Using the test script ```bash $ cd test
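
Taken together, the series leaves the server with two new non-streaming JSON endpoints — `/tts` (PATCH 1/9) and `/studio_speakers` plus `/languages` (PATCH 2/9, handler name corrected in PATCH 4/9) — alongside the pre-existing `/clone_speaker` and streaming endpoints. Below is a minimal standalone client distilled from the requests `demo.py` makes: a sketch that assumes the server is listening on `localhost:8000` (the default `demo.py` uses) and that, as in the demo, `/tts` responds with base64-encoded WAV bytes; the sample text and output path are illustrative.

```python
import base64
import requests

SERVER_URL = "http://localhost:8000"  # assumed default, matching demo.py

# Metadata endpoints added in PATCH 2/9.
languages = requests.get(SERVER_URL + "/languages").json()
studio_speakers = requests.get(SERVER_URL + "/studio_speakers").json()
print("Languages:", ", ".join(languages))
print("Studio speakers:", ", ".join(studio_speakers.keys()))

# Each speaker entry already carries the two conditioning inputs /tts expects,
# flattened to plain lists on the server side.
name, embeddings = next(iter(studio_speakers.items()))
response = requests.post(
    SERVER_URL + "/tts",
    json={
        "text": "A quick brown fox jumps over the lazy dog.",
        "language": "en",
        "speaker_embedding": embeddings["speaker_embedding"],
        "gpt_cond_latent": embeddings["gpt_cond_latent"],
    },
)

# The response body is base64-encoded WAV audio; demo.py decodes it the same way.
with open("generated.wav", "wb") as fp:
    fp.write(base64.b64decode(response.content))
print("Wrote generated.wav with studio speaker:", name)
```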
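
Cloning a new speaker follows the same pattern: `demo.py` (as of PATCH 3/9) sends the reference audio as a multipart upload and reuses the returned embeddings wherever a studio speaker's would go. A sketch under the same assumptions; `reference.wav` is a placeholder path.

```python
import requests

SERVER_URL = "http://localhost:8000"  # assumed default, matching demo.py

# /clone_speaker takes the reference audio as multipart form data under the
# "wav_file" field, exactly as demo.py sends it.
with open("reference.wav", "rb") as wav:
    embeddings = requests.post(
        SERVER_URL + "/clone_speaker",
        files={"wav_file": ("reference.wav", wav)},
    ).json()

# demo.py reads "speaker_embedding" and "gpt_cond_latent" from this dict, so
# the result can be passed straight to /tts in place of a studio speaker.
print("Cloned-speaker keys:", sorted(embeddings.keys()))
```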
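
One design note worth drawing out: PATCH 8/9 moves `/studio_speakers` from the downloaded `speakers.pth` file to the model's own `speaker_manager`, but in both versions the tensors are serialized as `.cpu().squeeze().half().tolist()`. Shipping the conditioning latents as half-precision JSON lists is what keeps every endpoint stateless — any client can hold the embeddings and replay them against `/tts` — at the cost of fairly large request and response bodies.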