From 798956eb2b57da55b53f20a65569883d7e237372 Mon Sep 17 00:00:00 2001 From: Patrick Darwinkel Date: Mon, 26 Feb 2024 17:46:18 +0100 Subject: [PATCH] Explicitly add poetry export plugin; change docker compose example; speedup docker cache --- .github/workflows/build_docker_image.yml | 3 +++ Dockerfile | 2 +- README.md | 3 +++ code/app.py | 10 ++++------ code/create_character_embeddings.py | 1 - code/create_dataset.py | 1 - code/inference.py | 1 - code/settings.py | 10 +++++++--- code/utils.py | 5 ++--- docker-compose.example.yml | 4 ++-- 10 files changed, 22 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build_docker_image.yml b/.github/workflows/build_docker_image.yml index 90ae8c5..c3fc479 100644 --- a/.github/workflows/build_docker_image.yml +++ b/.github/workflows/build_docker_image.yml @@ -23,6 +23,9 @@ jobs: - name: Install poetry run: curl -sSL https://install.python-poetry.org | python3 - + - name: Install poetry export plugin + run: poetry self add poetry-plugin-export + - name: Export requirements run: poetry export -o requirements.txt --only main,deploy diff --git a/Dockerfile b/Dockerfile index 3c50cfa..49c9d7a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /usr/src/app RUN apt-get update && apt-get install -y ffmpeg COPY requirements.txt ./ -COPY code ./ RUN pip install --no-cache-dir -r requirements.txt +COPY code ./ CMD [ "python", "./app.py" ] diff --git a/README.md b/README.md index c1ee683..fd48a5b 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ Most voices perform best when narrating medium-length sentences with medium-leng ![This paragraph is a good example of appropriate text input.](/examples/1708512151_ME2_f-player_f-Shepard.ogg) +## Deployment +GitHub Actions automatically produces a fresh image on every push to the `main` branch. See `docker-compose.example.yml` on how it can be deployed. + ## History (and other experiments) I initially [fine-tuned SpeechT5](https://huggingface.co/learn/audio-course/chapter6/fine-tuning), but the results were disappointing. That model very frequently produced garbage and/or hallucinated output for most voices. Interestingly, it also had a very strong bias towards female speakers. diff --git a/code/app.py b/code/app.py index 4826e5e..0a9c4a6 100644 --- a/code/app.py +++ b/code/app.py @@ -1,16 +1,14 @@ """Gradio web interface for ShepardTTS.""" -import os import time from pathlib import Path import gradio as gr import numpy as np -import torch -from torchaudio.io import CodecConfig, StreamWriter - import settings +import torch from app_helpers import description, examples, links +from torchaudio.io import CodecConfig, StreamWriter from utils import load_checkpoint, normalize_line model = load_checkpoint() @@ -83,7 +81,7 @@ def predict( for sentence in out["wav"]: waveform = torch.cat((waveform, QUARTER_SECOND_PAUSE, sentence, QUARTER_SECOND_PAUSE)) - base_filename = f"{os.environ['GRADIO_EXAMPLES_CACHE']}/{int(time.time())}_{character}" + base_filename = f"{settings.GRADIO_EXAMPLES_CACHE}/{int(time.time())}_{character}" if codec_format == "mp3": # Write compressed mp3 @@ -231,4 +229,4 @@ def predict( ) demo.queue(max_size=10) -demo.launch(debug=False, show_api=True, share=False, auth=("shepard", os.environ["SECRET_KEY"])) +demo.launch(debug=False, show_api=True, share=False, auth=("shepard", settings.SECRET_KEY)) diff --git a/code/create_character_embeddings.py b/code/create_character_embeddings.py index ff7c44f..1eeaad5 100644 --- a/code/create_character_embeddings.py +++ b/code/create_character_embeddings.py @@ -3,7 +3,6 @@ from pathlib import Path import torch - from utils import load_checkpoint diff --git a/code/create_dataset.py b/code/create_dataset.py index 2f3664e..9a35246 100644 --- a/code/create_dataset.py +++ b/code/create_dataset.py @@ -5,7 +5,6 @@ import pandas as pd import soundfile from datasets import Audio, Dataset, concatenate_datasets - from utils import normalize_line diff --git a/code/inference.py b/code/inference.py index 6f9ac5c..45ab692 100644 --- a/code/inference.py +++ b/code/inference.py @@ -5,7 +5,6 @@ import numpy as np import torch from torchaudio.io import StreamWriter - from utils import load_checkpoint, normalize_line model = load_checkpoint() diff --git a/code/settings.py b/code/settings.py index d740458..d7848ee 100644 --- a/code/settings.py +++ b/code/settings.py @@ -1,8 +1,12 @@ """Contains some universal settings.""" +import os import torch -CHECKPOINTS_CONFIG_JSON = "./current_model/config.json" -CHECKPOINT_DIR = "./current_model" -CHECKPOINT_VOCAB = "./current_model/vocab.json" +CHECKPOINTS_CONFIG_JSON = "/xtts_model/config.json" +CHECKPOINT_DIR = "/xtts_model/current_model" +CHECKPOINT_VOCAB = "/xtts_model/current_model/vocab.json" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +GRADIO_EXAMPLES_CACHE = os.environ["GRADIO_EXAMPLES_CACHE"] +SECRET_KEY = os.environ["SECRET_KEY"] diff --git a/code/utils.py b/code/utils.py index 5ce05ff..9075195 100644 --- a/code/utils.py +++ b/code/utils.py @@ -2,12 +2,11 @@ import re +import settings from cleantext import clean from num2words import num2words -from TTS.tts.configs.xtts_config import XttsConfig - -import settings from overrides import ShepardXtts +from TTS.tts.configs.xtts_config import XttsConfig def load_checkpoint(): diff --git a/docker-compose.example.yml b/docker-compose.example.yml index 819cecc..df58df8 100644 --- a/docker-compose.example.yml +++ b/docker-compose.example.yml @@ -2,11 +2,11 @@ version: '3.8' services: gradio: - build: . + image: "ghcr.io/darwinkel/shepardtts:main" ports: - "1337:1337" volumes: - - .:/usr/src/app + - ./current_model/:/xtts_model/:ro environment: GRADIO_EXAMPLES_CACHE: "/tmp/" GRADIO_ANALYTICS_ENABLED: False