diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8d639d5dee..7905add3f7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,7 +37,6 @@ jobs: sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -68,7 +67,7 @@ jobs: fail-fast: false matrix: python-version: ["3.9", "3.12"] - subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts"] steps: - uses: actions/checkout@v4 - name: Setup uv @@ -76,13 +75,12 @@ jobs: - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} - name: Install Espeak - if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts"]'), matrix.subset) run: | sudo apt-get update sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -107,9 +105,50 @@ jobs: name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* if-no-files-found: ignore + zoo: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + partition: ["0", "1", "2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Zoo tests + run: uv run --extra server --extra languages make test_zoo + env: + NUM_PARTITIONS: 3 + TEST_PARTITION: ${{ matrix.partition }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: coverage-data-zoo-${{ matrix.partition }} + path: .coverage.* + if-no-files-found: ignore coverage: if: always() - needs: [unit, integration] + needs: [unit, integration, zoo] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/Makefile b/Makefile index 6964773fb5..2b3705974a 100644 --- a/Makefile +++ b/Makefile @@ -6,48 +6,42 @@ help: target_dirs := tests TTS notebooks recipes -test_all: ## run tests and don't stop on an error. - nose2 --with-coverage --coverage TTS tests - ./run_bash_tests.sh - test: ## run tests. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --durations=0 tests test_vocoder: ## run vocoder tests. - coverage run -m nose2 -F -v -B tests.vocoder_tests + coverage run -m pytest -x -v --durations=0 tests/vocoder_tests test_tts: ## run tts tests. 
- coverage run -m nose2 -F -v -B tests.tts_tests + coverage run -m pytest -x -v --durations=0 tests/tts_tests test_tts2: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests2 + coverage run -m pytest -x -v --durations=0 tests/tts_tests2 test_xtts: - coverage run -m nose2 -F -v -B tests.xtts_tests + coverage run -m pytest -x -v --durations=0 tests/xtts_tests test_aux: ## run aux tests. - coverage run -m nose2 -F -v -B tests.aux_tests + coverage run -m pytest -x -v --durations=0 tests/aux_tests ./run_bash_tests.sh -test_zoo0: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ - tests.zoo_tests.test_models.test_voice_conversion -test_zoo1: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 -test_zoo2: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 +test_zoo: ## run zoo tests. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_models.py + +test_zoo_big: ## run tests for models that are too big for CI. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_big_models.py inference_tests: ## run inference tests. - coverage run -m nose2 -F -v -B tests.inference_tests + coverage run -m pytest -x -v --durations=0 tests/inference_tests data_tests: ## run data tests. - coverage run -m nose2 -F -v -B tests.data_tests + coverage run -m pytest -x -v --durations=0 tests/data_tests test_text: ## run text tests. - coverage run -m nose2 -F -v -B tests.text_tests + coverage run -m pytest -x -v --durations=0 tests/text_tests test_failed: ## only run tests failed the last time. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --last-failed tests style: ## update code style. 
uv run --only-dev black ${target_dirs} diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index b8f69b54e5..8d7a2633a0 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -113,7 +113,7 @@ # compute attentions file_paths = [] - with torch.no_grad(): + with torch.inference_mode(): for data in tqdm(loader): # setup input data text_input = data[0] diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index acec91c369..b7c52ac6c5 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -6,6 +6,7 @@ import logging import os import sys +from typing import Optional import numpy as np from tqdm import tqdm @@ -17,10 +18,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def main(): - """Run preprocessing process.""" - setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) - +def parse_args(arg_list: Optional[list[str]]) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -30,7 +28,13 @@ def main(): required=False, help="folder including the target set of wavs overriding dataset config.", ) - args, overrides = parser.parse_known_args() + return parser.parse_known_args(arg_list) + + +def main(arg_list: Optional[list[str]] = None): + """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) + args, overrides = parse_args(arg_list) CONFIG = load_config(args.config_path) CONFIG.parse_known_args(overrides, relaxed_parser=True) @@ -95,6 +99,7 @@ def main(): stats["audio_config"] = CONFIG.audio.to_dict() np.save(output_file_path, stats, allow_pickle=True) print(f" > stats saved to {output_file_path}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index a04005ce39..77072f9efa 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -3,8 +3,9 @@ import argparse import logging -import os import sys +from pathlib import Path +from typing import Optional import numpy as np import torch @@ -13,8 +14,10 @@ from trainer.generic_utils import count_parameters from TTS.config import load_config +from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor @@ -24,56 +27,66 @@ use_cuda = torch.cuda.is_available() -def setup_loader(ap, r): - tokenizer, _ = TTSTokenizer.init_from_config(c) +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + 
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) + return parser.parse_args(arg_list) + + +def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader: + tokenizer, _ = TTSTokenizer.init_from_config(config) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=False, - samples=meta_data, + samples=samples, tokenizer=tokenizer, ap=ap, batch_group_size=0, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - phoneme_cache_path=c.phoneme_cache_path, + min_text_len=config.min_text_len, + max_text_len=config.max_text_len, + min_audio_len=config.min_audio_len, + max_audio_len=config.max_audio_len, + phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None, ) - if c.use_phonemes and c.compute_input_seq_cache: + if config.use_phonemes and config.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(c.num_loader_workers) + dataset.compute_input_seq(config.num_loader_workers) dataset.preprocess_samples() - loader = DataLoader( + return DataLoader( dataset, - batch_size=c.batch_size, + batch_size=config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=None, - num_workers=c.num_loader_workers, + num_workers=config.num_loader_workers, pin_memory=False, ) - return loader -def set_filename(wav_path, out_path): - wav_file = os.path.basename(wav_path) - file_name = wav_file.split(".")[0] - os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) - os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) - wavq_path = os.path.join(out_path, "quant", file_name) - mel_path = os.path.join(out_path, "mel", file_name) - wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") - wav_path = os.path.join(out_path, "wav", file_name + ".wav") - return file_name, wavq_path, mel_path, wav_gl_path, wav_path +def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]: + wav_name = Path(wav_path).stem + (out_path / "quant").mkdir(exist_ok=True, parents=True) + (out_path / "mel").mkdir(exist_ok=True, parents=True) + (out_path / "wav_gl").mkdir(exist_ok=True, parents=True) + (out_path / "wav").mkdir(exist_ok=True, parents=True) + wavq_path = out_path / "quant" / wav_name + mel_path = out_path / "mel" / wav_name + wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav" + out_wav_path = out_path / "wav" / f"{wav_name}.wav" + return wavq_path, mel_path, wav_gl_path, out_wav_path def format_data(data): @@ -115,18 +128,18 @@ def format_data(data): ) -@torch.no_grad() +@torch.inference_mode() def inference( - model_name, - model, - ap, + model_name: str, + model: BaseTTS, + ap: AudioProcessor, 
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=None, d_vectors=None, -): +) -> np.ndarray: if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: @@ -141,9 +154,9 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.detach().cpu().numpy() + return model_output.detach().cpu().numpy() - elif "tacotron" in model_name: + if "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] @@ -154,16 +167,24 @@ def inference( for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) - model_output = torch.stack(mel_specs).cpu().numpy() - - elif model_name == "tacotron2": - model_output = postnet_outputs.detach().cpu().numpy() - return model_output + return torch.stack(mel_specs).cpu().numpy() + if model_name == "tacotron2": + return postnet_outputs.detach().cpu().numpy() + msg = f"Model not supported: {model_name}" + raise ValueError(msg) def extract_spectrograms( - data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" -): + model_name: str, + data_loader: DataLoader, + model: BaseTTS, + ap: AudioProcessor, + output_path: Path, + quantize_bits: int = 0, + save_audio: bool = False, + debug: bool = False, + metadata_name: str = "metadata.txt", +) -> None: model.eval() export_metadata = [] for _, data in tqdm(enumerate(data_loader), total=len(data_loader)): @@ -182,7 +203,7 @@ def extract_spectrograms( ) = format_data(data) model_output = inference( - c.model.lower(), + model_name, model, ap, text_input, @@ -196,7 +217,7 @@ def extract_spectrograms( for idx in range(text_input.shape[0]): wav_file_path = item_idx[idx] wav = ap.load_wav(wav_file_path) - _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) # quantize and save wav if quantize_bits > 0: @@ -218,74 +239,67 @@ def extract_spectrograms( wav = ap.inv_melspectrogram(mel) ap.save_wav(wav, wav_gl_path) - with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + with (output_path / metadata_name).open("w") as f: for data in export_metadata: - f.write(f"{data[0]}|{data[1]+'.npy'}\n") + f.write(f"{data[0] / data[1]}.npy\n") -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data, speaker_manager +def main(arg_list: Optional[list[str]] = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) + config = load_config(args.config_path) + config.audio.trim_silence = False # Audio processor - ap = AudioProcessor(**c.audio) + ap = AudioProcessor(**config.audio) # load data instances meta_data_train, meta_data_eval = load_tts_samples( - c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=args.eval, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # init speaker manager - if c.use_speaker_embedding: + if 
config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif c.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) else: speaker_manager = None # setup model - model = setup_model(c) + model = setup_model(config) # restore model - model.load_checkpoint(c, args.checkpoint_path, eval=True) + model.load_checkpoint(config, args.checkpoint_path, eval=True) if use_cuda: model.cuda() num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) # set r - r = 1 if c.model.lower() == "glow_tts" else model.decoder.r - own_loader = setup_loader(ap, r) + r = 1 if config.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(config, ap, r, speaker_manager, meta_data) extract_spectrograms( + config.model.lower(), own_loader, model, ap, - args.output_path, + Path(args.output_path), quantize_bits=args.quantize_bits, save_audio=args.save_audio, debug=args.debug, - metada_name="metada.txt", + metadata_name="metadata.txt", ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser() - parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) - parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) - parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) - parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") - parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") - parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) - args = parser.parse_args() - - c = load_config(args.config_path) - c.audio.trim_silence = False - main(args) + main() diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 7c68fdb070..0c453db85b 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,10 +1,11 @@ -"""Find all the unique characters in a dataset""" +"""Find all the unique characters in a dataset.""" import argparse import logging import multiprocessing import sys from argparse import RawTextHelpFormatter +from typing import Optional from tqdm.contrib.concurrent import process_map @@ -14,18 +15,13 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def compute_phonemes(item): +def compute_phonemes(item: dict) -> set[str]: text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") return set(ph) -def main(): - setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) - - # pylint: disable=W0601 - global c, phonemizer - # pylint: disable=bad-option-value +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -36,13 +32,21 @@ def main(): formatter_class=RawTextHelpFormatter, ) parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) - args = 
parser.parse_args() + return parser.parse_args(arg_list) - c = load_config(args.config_path) + +def main(arg_list: Optional[list[str]] = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + global phonemizer + args = parse_args(arg_list) + config = load_config(args.config_path) # load all datasets train_items, eval_items = load_tts_samples( - c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) items = train_items + eval_items print("Num items:", len(items)) @@ -50,13 +54,16 @@ def main(): language_list = [item["language"] for item in items] is_lang_def = all(language_list) - if not c.phoneme_language or not is_lang_def: - raise ValueError("Phoneme language must be defined in config.") + if not config.phoneme_language or not is_lang_def: + msg = "Phoneme language must be defined in config." + raise ValueError(msg) - if not language_list.count(language_list[0]) == len(language_list): - raise ValueError( - "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" + if language_list.count(language_list[0]) != len(language_list): + msg = ( + "Currently, just one phoneme language per config file is supported !! " + "Please split the dataset config into different configs and run it individually for each language !!" ) + raise ValueError(msg) phonemizer = Gruut(language=language_list[0], keep_puncs=True) @@ -74,6 +81,7 @@ def main(): print(f" > Unique phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 5d20db6a59..8fe30d0bce 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -7,6 +7,7 @@ import logging import sys from argparse import RawTextHelpFormatter +from typing import Optional # pylint: disable=redefined-outer-name, unused-argument from TTS.utils.generic_utils import ConsoleFormatter, setup_logger @@ -134,7 +135,7 @@ """ -def parse_args() -> argparse.Namespace: +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -290,7 +291,7 @@ def parse_args() -> argparse.Namespace: help="Voice dir for tortoise model", ) - args = parser.parse_args() + args = parser.parse_args(arg_list) # print the description if either text or list_models is not set check_args = [ @@ -309,9 +310,9 @@ def parse_args() -> argparse.Namespace: return args -def main() -> None: +def main(arg_list: Optional[list[str]] = None) -> None: """Entry point for `tts` command line interface.""" - args = parse_args() + args = parse_args(arg_list) stream = sys.stderr if args.pipe_out else sys.stdout setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter()) @@ -340,18 +341,18 @@ def main() -> None: # 1) List pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() + sys.exit(0) # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) - sys.exit() + 
sys.exit(0) if args.model_info_by_name: model_query_full_name = args.model_info_by_name manager.model_info_by_full_name(model_query_full_name) - sys.exit() + sys.exit(0) # 3) Load a model for further info or TTS/VC device = args.device @@ -377,23 +378,23 @@ def main() -> None: if args.list_speaker_idxs: if not api.is_multi_speaker: logger.info("Model only has a single speaker.") - return + sys.exit(0) logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) logger.info(api.speakers) - return + sys.exit(0) # query langauge ids of a multi-lingual model. if args.list_language_idxs: if not api.is_multi_lingual: logger.info("Monolingual model.") - return + sys.exit(0) logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) logger.info(api.languages) - return + sys.exit(0) # check the arguments against a multi-speaker model. if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): @@ -401,7 +402,7 @@ def main() -> None: "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." ) - return + sys.exit(1) # RUN THE SYNTHESIS if args.text: @@ -430,6 +431,7 @@ def main() -> None: pipe_out=pipe_out, ) logger.info("Saved VC output to %s", args.out_path) + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 84123d2db3..a37ab8efc9 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -87,7 +87,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False): def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): - with torch.no_grad(): + with torch.inference_mode(): # setup input data inputs, labels = data diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index aa04177068..7cf5696237 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -2,6 +2,7 @@ import os import sys from dataclasses import dataclass, field +from typing import Optional from trainer import Trainer, TrainerArgs @@ -17,7 +18,7 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: Optional[list[str]] = None): """Run `tts` model training directly by a `config.json` file.""" setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) @@ -26,7 +27,7 @@ def main(): parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -76,6 +77,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index 2082019aad..603481cc56 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -64,11 +64,11 @@ def get_torch_mel_spectrogram_class(self, audio_config): ), ) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, l2_norm=True): return self.forward(x, l2_norm) - @torch.no_grad() + @torch.inference_mode() def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate 
embeddings for a batch of utterances diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 981d6cdb1f..2aa82c9a88 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -421,7 +421,7 @@ def forward( "spk_emb": speaker_embedding, } - @torch.no_grad() + @torch.inference_mode() def inference( self, tokens: torch.Tensor, diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 2e6ac01a87..550ad3e3b2 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -97,7 +97,7 @@ def forward(self, latents, g=None): o = self.waveform_decoder(z, g=g) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g): """ Args: diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 44cf940c69..2f4b54cec1 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -45,7 +45,7 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): - @torch.no_grad() + @torch.inference_mode() def generate( # noqa: PLR0911 self, inputs: Optional[torch.Tensor] = None, @@ -662,7 +662,7 @@ def typeerror(): **model_kwargs, ) - @torch.no_grad() + @torch.inference_mode() def sample_stream( self, input_ids: torch.LongTensor, diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 107054189c..78ac869434 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -225,7 +225,7 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels ) return losses - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: @@ -335,7 +335,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 WeightsFileHandler.add_pre_callback(callback_clearml_load_save) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 28a52bc558..c1d0cf0aea 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -288,7 +288,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index e6db116081..000fbd596f 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -622,7 +622,7 @@ def forward( model_outputs["slice_ids"] = slice_ids return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"d_vectors": None, "speaker_ids": None}, pitch_transform=None, energy_transform=None ): @@ -646,7 +646,7 @@ def inference( model_outputs["model_outputs"] = vocoder_output return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference_spec_decoder(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): encoder_outputs = self.acoustic_model.inference( tokens=x, @@ -1018,7 +1018,7 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): } return return_dict - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. 
diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index d09e3ea91b..03166fa8c0 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -628,7 +628,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """Model's inference pass. diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5bf4713140..aaf5190ada 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -262,7 +262,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_with_MAS( self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -318,7 +318,7 @@ def inference_with_MAS( } return outputs - @torch.no_grad() + @torch.inference_mode() def decoder_inference( self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -341,7 +341,7 @@ def decoder_inference( outputs["logdet"] = logdet return outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -464,7 +464,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -473,7 +473,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. 
diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index 0b3fadafbf..b9a23000a0 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -195,7 +195,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 1c146b2eac..10157e43a4 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -209,7 +209,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5d3efd2021..da85823f3f 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -218,7 +218,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text_input, aux_input=None): aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 2716a39786..e2edd4bb5c 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -238,7 +238,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text, aux_input=None): """Forward pass for inference with no Teacher-Forcing. diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7ec2519236..b52e337145 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -927,7 +927,7 @@ def _set_x_lengths(x, aux_input): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -1014,7 +1014,7 @@ def inference( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_voice_conversion( self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None ): @@ -1209,7 +1209,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) @@ -1266,7 +1266,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. 
@@ -1294,7 +1294,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: do_trim_silence=False, ).values() test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 3a715dd75d..e009a7c438 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -334,6 +334,7 @@ def init_encoder( ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) + @torch.inference_mode() def compute_embedding_from_clip( self, wav_file: Union[Union[str, os.PathLike[Any]], List[Union[str, os.PathLike[Any]]]] ) -> list: diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index 237409556a..8a06d1a4ce 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -389,7 +389,7 @@ def forward( return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g=None, mel=None, c_lengths=None): """ Inference pass of the model diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py index 60912deed0..3f193b6b69 100644 --- a/TTS/vc/models/openvoice.py +++ b/TTS/vc/models/openvoice.py @@ -228,7 +228,7 @@ def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tenso return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x: torch.Tensor, diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index ee25559af0..292d3323bb 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -24,7 +24,7 @@ def __init__( num_res_blocks=num_res_blocks, ) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 8792950a56..7785d8011c 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -212,7 +212,7 @@ def train_log( logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 8273d02037..e8f175ed17 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -280,7 +280,7 @@ def forward(self, x, g=None): o = torch.tanh(o) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """ Args: diff --git a/TTS/vocoder/models/multiband_melgan_generator.py b/TTS/vocoder/models/multiband_melgan_generator.py index 25d6590659..6eee712db3 100644 --- a/TTS/vocoder/models/multiband_melgan_generator.py +++ b/TTS/vocoder/models/multiband_melgan_generator.py @@ -32,7 +32,7 @@ def pqmf_analysis(self, x): def pqmf_synthesis(self, x): return self.pqmf_layer.synthesis(x) - @torch.no_grad() + @torch.inference_mode() def inference(self, 
cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index e60c8781f0..0659a00cc1 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -127,7 +127,7 @@ def forward(self, c): return x - @torch.no_grad() + @torch.inference_mode() def inference(self, c): c = c.to(self.first_conv.weight.device) c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 5d1f817927..19f5648f4d 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -139,7 +139,7 @@ def receptive_field_size(self): """Return receptive field size.""" return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """Perform inference. Args: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c49abd2201..d756f956dd 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -123,7 +123,7 @@ def load_noise_schedule(self, path): beta = np.load(path, allow_pickle=True).item()["beta"] # pylint: disable=unexpected-keyword-arg self.compute_noise_level(beta) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, y_n=None): """ Shapes: @@ -262,7 +262,7 @@ def train_log( # pylint: disable=no-self-use ) -> Tuple[Dict, np.ndarray]: pass - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: return self.train_step(batch, criterion) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 1847679890..4ece55af62 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -307,7 +307,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) - with torch.no_grad(): + with torch.inference_mode(): if isinstance(mels, np.ndarray): mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device)) diff --git a/pyproject.toml b/pyproject.toml index a7baf29e31..ba28618d0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,8 +137,8 @@ all = [ dev = [ "black==24.2.0", "coverage[toml]>=7", - "nose2>=0.15", "pre-commit>=3", + "pytest>=8", "ruff==0.7.0", ] # Dependencies for building the documentation @@ -235,10 +235,10 @@ max-returns = 7 line-length = 120 target-version = ['py39'] +[tool.coverage.report] +skip_covered = true +skip_empty = true + [tool.coverage.run] parallel = true source = ["TTS"] - -[tool.cibuildwheel] -build = "cp*" -skip = "*-win32 *i686 *musllinux*" diff --git a/run_bash_tests.sh b/run_bash_tests.sh index 2f5ba88934..5f6cd43f68 100755 --- a/run_bash_tests.sh +++ b/run_bash_tests.sh @@ -3,5 +3,4 @@ TF_CPP_MIN_LOG_LEVEL=3 # runtime bash based tests # TODO: move these to python -./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_compute_statistics.sh +./tests/bash_tests/test_demo_server.sh diff --git a/tests/__init__.py b/tests/__init__.py index f0a8b2f118..8108bdeb50 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,7 @@ import os +from typing import Callable, Optional +import pytest from trainer.generic_utils import get_cuda from 
TTS.config import BaseDatasetConfig @@ -44,6 +46,12 @@ def run_cli(command): assert exit_status == 0, f" [!] command `{command}` failed." +def run_main(main_func: Callable, args: Optional[list[str]] = None, expected_code: int = 0): + with pytest.raises(SystemExit) as exc_info: + main_func(args) + assert exc_info.value.code == expected_code + + def get_test_data_config(): return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 5b1fa9d38a..6caf6db30d 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -1,190 +1,194 @@ import os -import unittest -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +import pytest + +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio.processor import AudioProcessor -TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) -# pylint: disable=protected-access -class TestAudio(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf) - - def test_audio_synthesis(self): - """1. load wav - 2. set normalization parameters - 3. extract mel-spec - 4. invert to wav and save the output - """ - print(" > Sanity check for the process wav -> mel -> wav") - - def _test(max_norm, signal_norm, symmetric_norm, clip_norm): - self.ap.max_norm = max_norm - self.ap.signal_norm = signal_norm - self.ap.symmetric_norm = symmetric_norm - self.ap.clip_norm = clip_norm - wav = self.ap.load_wav(WAV_FILE) - mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_melspectrogram(mel) - file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format( - max_norm, signal_norm, symmetric_norm, clip_norm - ) - print(" | > Creating wav file at : ", file_name) - self.ap.save_wav(wav_, OUT_PATH + file_name) - - # maxnorm = 1.0 - _test(1.0, False, False, False) - _test(1.0, True, False, False) - _test(1.0, True, True, False) - _test(1.0, True, False, True) - _test(1.0, True, True, True) - # maxnorm = 4.0 - _test(4.0, False, False, False) - _test(4.0, True, False, False) - _test(4.0, True, True, False) - _test(4.0, True, False, True) - _test(4.0, True, True, True) - - def test_normalize(self): - """Check normalization and denormalization for range values and consistency""" - print(" > Testing normalization and denormalization.") - wav = self.ap.load_wav(WAV_FILE) - wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. - self.ap.signal_norm = False - x = self.ap.melspectrogram(wav) - x_old = x - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= 0 - 1, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() < 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - def test_scaler(self): - scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") - conf.stats_path = scaler_stats_path - conf.preemphasis = 0.0 - conf.do_trim_silence = True - conf.signal_norm = True - - ap = AudioProcessor(**conf) - mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) - ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - - self.ap.signal_norm = False - self.ap.preemphasis = 0.0 - - # test scaler forward and backward transforms - wav = self.ap.load_wav(WAV_FILE) - mel_reference = self.ap.melspectrogram(wav) - mel_norm = ap.melspectrogram(wav) - mel_denorm = ap.denormalize(mel_norm) - assert abs(mel_reference - mel_denorm).max() < 1e-4 - - def test_compute_f0(self): # pylint: disable=no-self-use - ap = AudioProcessor(**conf) - wav = ap.load_wav(WAV_FILE) - pitch = ap.compute_f0(wav) - mel = ap.melspectrogram(wav) - assert pitch.shape[0] == mel.shape[1] +@pytest.fixture +def ap(): + """Set up audio processor.""" + return AudioProcessor(**conf) + + +norms = [ + # maxnorm = 1.0 + (1.0, False, False, False), + (1.0, True, False, False), + (1.0, True, True, False), + (1.0, True, False, True), + (1.0, True, True, True), + # maxnorm = 4.0 + (4.0, False, False, False), + (4.0, True, False, False), + (4.0, True, True, False), + (4.0, True, False, True), + (4.0, True, True, True), +] + + +@pytest.mark.parametrize("norms", norms) +def test_audio_synthesis(tmp_path, ap, norms): + """1. load wav + 2. set normalization parameters + 3. extract mel-spec + 4. 
invert to wav and save the output + """ + print(" > Sanity check for the process wav -> mel -> wav") + max_norm, signal_norm, symmetric_norm, clip_norm = norms + ap.max_norm = max_norm + ap.signal_norm = signal_norm + ap.symmetric_norm = symmetric_norm + ap.clip_norm = clip_norm + wav = ap.load_wav(WAV_FILE) + mel = ap.melspectrogram(wav) + wav_ = ap.inv_melspectrogram(mel) + file_name = ( + f"audio_test-melspec_max_norm_{max_norm}-signal_norm_{signal_norm}-" + f"symmetric_{symmetric_norm}-clip_norm_{clip_norm}.wav" + ) + print(" | > Creating wav file at : ", file_name) + ap.save_wav(wav_, tmp_path / file_name) + + +def test_normalize(ap): + """Check normalization and denormalization for range values and consistency""" + print(" > Testing normalization and denormalization.") + wav = ap.load_wav(WAV_FILE) + wav = ap.sound_norm(wav) # normalize audio to get abetter normalization range below. + ap.signal_norm = False + x = ap.melspectrogram(wav) + x_old = x + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= 0 - 1, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= -ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. 
+ x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + ap.signal_norm = True + ap.symmetric_norm = True + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() < 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + +def test_scaler(ap): + scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") + conf.stats_path = scaler_stats_path + conf.preemphasis = 0.0 + conf.do_trim_silence = True + conf.signal_norm = True + + ap = AudioProcessor(**conf) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + ap.signal_norm = False + ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = ap.load_wav(WAV_FILE) + mel_reference = ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap.denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 + + +def test_compute_f0(ap): + wav = ap.load_wav(WAV_FILE) + pitch = ap.compute_f0(wav) + mel = ap.melspectrogram(wav) + assert pitch.shape[0] == mel.shape[1] diff --git a/tests/aux_tests/test_compute_statistics.py b/tests/aux_tests/test_compute_statistics.py new file mode 100644 index 0000000000..d6809eb480 --- /dev/null +++ b/tests/aux_tests/test_compute_statistics.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from tests import get_tests_input_path, run_main +from TTS.bin.compute_statistics import main + + +def test_compute_statistics(tmp_path): + config_path = Path(get_tests_input_path()) / "test_glow_tts_config.json" + output_path = tmp_path / "scale_stats.npy" + run_main(main, ["--config_path", str(config_path), "--out_path", str(output_path)]) diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index f2d119ac35..563c5dae02 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -1,67 +1,23 @@ -import os -import unittest +from pathlib import Path +import pytest import torch -from tests import get_tests_input_path, get_tests_output_path, run_cli +from tests import get_tests_input_path, run_main +from TTS.bin.extract_tts_spectrograms import main from TTS.config import load_config from TTS.tts.models import setup_model torch.manual_seed(1) -# pylint: disable=protected-access -class TestExtractTTSSpectrograms(unittest.TestCase): - @staticmethod - def test_GlowTTS(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth") - output_path = os.path.join(get_tests_output_path(), 
"output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') +@pytest.mark.parametrize("model", ["glow_tts", "tacotron", "tacotron2"]) +def test_extract_tts_spectrograms(tmp_path, model): + config_path = str(Path(get_tests_input_path()) / f"test_{model}_config.json") + checkpoint_path = str(tmp_path / f"{model}.pth") + output_path = str(tmp_path / "output_extract_tts_spectrograms") - @staticmethod - def test_Tacotron2(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') - - @staticmethod - def test_Tacotron(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') + config = load_config(config_path) + model = setup_model(config) + torch.save({"model": model.state_dict()}, checkpoint_path) + run_main(main, ["--config_path", config_path, "--checkpoint_path", checkpoint_path, "--output_path", output_path]) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 018679f573..53298cdebd 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -1,16 +1,12 @@ -import os -import unittest - import torch -from tests import get_tests_output_path, run_cli +from tests import run_main +from TTS.bin.find_unique_phonemes import main from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") - dataset_config_en = BaseDatasetConfig( formatter="ljspeech", meta_file_train="metadata.csv", @@ -30,52 +26,26 @@ """ -# pylint: disable=protected-access -class TestFindUniquePhonemes(unittest.TestCase): - @staticmethod - def test_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - 
phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') - - @staticmethod - def test_no_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') +def test_find_phonemes(tmp_path): + # prepare the config + config_path = str(tmp_path / "test_model_config.json") + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en], + ) + config.save_json(config_path) + + # run test + run_main(main, ["--config_path", config_path]) diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 00597a0f88..129ba5d86b 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -7,18 +7,12 @@ import numpy as np from coqpit import Coqpit -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path, get_tests_path from TTS.utils.audio import numpy_transforms as np_transforms TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) - - -# pylint: disable=no-self-use - class TestNumpyTransforms(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 5d8626faa6..0e15db2ab0 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -1,88 +1,86 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig -def run_test_train(): - command = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " +def test_train(tmp_path): + config_path = tmp_path / "test_speaker_encoder_config.json" + output_path = tmp_path / "train_outputs" + + def run_test_train(): + command = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path 
{config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + ) + run_cli(command) + + config = SpeakerEncoderConfig( + batch_size=4, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, + print_step=1, + save_step=2, + print_eval=True, + run_eval=True, + audio=BaseAudioConfig(num_mels=80), ) - run_cli(command) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.loss = "ge2e" + config.save_json(config_path) + print(config) + # train the model for one epoch + run_test_train() -config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -config = SpeakerEncoderConfig( - batch_size=4, - num_classes_in_batch=4, - num_utter_per_class=2, - eval_num_classes_in_batch=4, - eval_num_utter_per_class=2, - num_loader_workers=1, - epochs=1, - print_step=1, - save_step=2, - print_eval=True, - run_eval=True, - audio=BaseAudioConfig(num_mels=80), -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.loss = "ge2e" -config.save_json(config_path) - -print(config) -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test resnet speaker encoder -config.model_params["model_name"] = "resnet" -config.save_json(config_path) - -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # test resnet speaker encoder + config.model_params["model_name"] = "resnet" + config.save_json(config_path) -# test model with ge2e loss function -# config.loss = "ge2e" -# config.save_json(config_path) -# run_test_train() + # train the model for one epoch + run_test_train() -# test model with angleproto loss function -# config.loss = "angleproto" -# config.save_json(config_path) -# run_test_train() + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# test model with softmaxproto loss function -config.loss = "softmaxproto" -config.save_json(config_path) -run_test_train() + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + 
shutil.rmtree(continue_path) + + # test model with ge2e loss function + # config.loss = "ge2e" + # config.save_json(config_path) + # run_test_train() + + # test model with angleproto loss function + # config.loss = "angleproto" + # config.save_json(config_path) + # run_test_train() + + # test model with softmaxproto loss function + config.loss = "softmaxproto" + config.save_json(config_path) + run_test_train() diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh deleted file mode 100755 index 721777f852..0000000000 --- a/tests/bash_tests/test_compute_statistics.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -echo "$BASEDIR" -# run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..f260af161e 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,6 +44,9 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False @@ -54,203 +54,200 @@ print(" > Dynamic data loader test: {}".format(DATA_EXIST)) -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - 
text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. 
- """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. - self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +def _create_dataloader(batch_size, r, bgs, dataset_config, 
start_by_longest=False, preprocess_samples=False):
+    # load dataset
+    meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
+    items = meta_data_train + meta_data_eval
+    tokenizer, _ = TTSTokenizer.init_from_config(c)
+    dataset = TTSDataset(
+        outputs_per_step=r,
+        compute_linear_spec=True,
+        return_wav=True,
+        tokenizer=tokenizer,
+        ap=ap,
+        samples=items,
+        batch_group_size=bgs,
+        min_text_len=c.min_text_len,
+        max_text_len=c.max_text_len,
+        min_audio_len=c.min_audio_len,
+        max_audio_len=c.max_audio_len,
+        start_by_longest=start_by_longest,
+    )
+
+    # add preprocess to force the length computation
+    if preprocess_samples:
+        dataset.preprocess_samples()
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=dataset.collate_fn,
+        drop_last=True,
+        num_workers=c.num_loader_workers,
+    )
+    return dataloader, dataset
+
+
+@pytest.mark.parametrize("dataset_config", dataset_configs)
+def test_loader(dataset_config: BaseDatasetConfig):
+    batch_size = 1
+    dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True)
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        text_input = data["token_id"]
+        _ = data["token_id_lengths"]
+        speaker_name = data["speaker_names"]
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        _ = data["stop_targets"]
+        _ = data["item_idxs"]
+        wavs = data["waveform"]
+
+        neg_values = text_input[text_input < 0]
+        check_count = len(neg_values)
+
+        # check basic conditions
+        assert check_count == 0
+        assert linear_input.shape[0] == mel_input.shape[0] == batch_size
+        assert linear_input.shape[2] == ap.fft_size // 2 + 1
+        assert mel_input.shape[2] == c.audio["num_mels"]
+        assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length
+        assert isinstance(speaker_name[0], str)
+
+        # make sure that the computed mels and the waveform match and correctly computed
+        mel_new = ap.melspectrogram(wavs[0].squeeze().numpy())
+        # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
+        mel_new = mel_new[:, : mel_lengths[0]]
+        ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
+        mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
+        assert abs(mel_diff.sum()) < 1e-5
+
+        # check normalization ranges
+        if ap.symmetric_norm:
+            assert mel_input.max() <= ap.max_norm
+            assert mel_input.min() >= -ap.max_norm
+            assert mel_input.min() < 0
+        else:
+            assert mel_input.max() <= ap.max_norm
+            assert mel_input.min() >= 0
+
+
+def test_batch_group_shuffle():
+    dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav)
+    last_length = 0
+    frames = dataset.samples
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        avg_length = mel_lengths.numpy().mean()
+    dataloader.dataset.preprocess_samples()
+    is_items_reordered = False
+    for idx, item in enumerate(dataloader.dataset.samples):
+        if item != frames[idx]:
+            is_items_reordered = True
+            break
+    assert avg_length >= last_length
+    assert is_items_reordered
+
+
+def test_start_by_longest():
+    """Test start_by_longest option.
+
+    The first item of the first batch must be longer than all the other items.
+ """ + dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + assert all(max_len >= mel_lengths) + + +def test_padding_and_spectrograms(tmp_path): + def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): + assert linear_input[idx, -1].sum() != 0 # check padding + assert linear_input[idx, -2].sum() != 0 + assert mel_input[idx, -1].sum() != 0 + assert mel_input[idx, -2].sum() != 0 + assert stop_target[idx, -1] == 1 + assert stop_target[idx, -2] == 0 + assert stop_target[idx].sum() == 1 + assert len(mel_lengths.shape) == 1 + assert mel_lengths[idx] == linear_input[idx].shape[0] + assert mel_lengths[idx] == mel_input[idx].shape[0] + + dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # check mel_spec consistency + wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32) + mel = ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail. + assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..beb7df689b 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,20 +1,17 @@ -import os +from tests import 
run_main +from TTS.bin.synthesize import main -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli("tts --list_models") + output_path = str(tmp_path / "output.wav") + + run_main(main, ["--list_models"]) # single speaker model - run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' - ) - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " - "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - f'--text "This is an example." --out_path "{output_path}"' - ) + args = ["--text", "This is an example.", "--out_path", output_path] + run_main(main, args) + + args = [*args, "--model_name", "tts_models/en/ljspeech/glow-tts"] + run_main(main, args) + run_main(main, [*args, "--vocoder_name", "vocoder_models/en/ljspeech/multiband-melgan"]) diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts_config.json similarity index 100% rename from tests/inputs/test_align_tts.json rename to tests/inputs/test_align_tts_config.json diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts_config.json similarity index 100% rename from tests/inputs/test_glow_tts.json rename to tests/inputs/test_glow_tts_config.json diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech_config.json similarity index 100% rename from tests/inputs/test_speedy_speech.json rename to tests/inputs/test_speedy_speech_config.json diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad_config.json similarity index 100% rename from tests/inputs/test_vocoder_wavegrad.json rename to tests/inputs/test_vocoder_wavegrad_config.json diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py index 4789d53d9e..f4b8d5cadd 100644 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ b/tests/tts_tests/test_neuralhmm_tts_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = NeuralhmmTTSConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 
-config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = NeuralhmmTTSConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + 
assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py index d86bde6854..e2dec3c899 100644 --- a/tests/tts_tests/test_overflow_train.py +++ b/tests/tts_tests/test_overflow_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.overflow_config import OverflowConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = OverflowConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = OverflowConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an 
echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts 
--text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 2aac7f101d..30efe38d9f 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig -config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_speedy_speech_config.json" + output_path = tmp_path / "train_outputs" -config = SpeedySpeechConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = SpeedySpeechConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train 
metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index d2d1d5c35f..191e0a19ee 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,79 +1,81 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=False, - use_d_vector_file=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=False, + use_d_vector_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + 
"--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index 83a07d1a6c..2696edb1b6 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,77 +1,79 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=True, - num_speakers=4, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + num_speakers=4, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = 
os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index df0e934d8e..f8667b6d02 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,72 +1,72 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = 
max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 17f1fd46a6..cc91b18c34 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -1,64 +1,63 @@ -import glob -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron_config import TacotronConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = TacotronConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - r=5, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), 
key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = TacotronConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + r=5, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 741bda91e9..b95e1deed3 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -1,61 +1,61 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 09df7d29f2..189e6cfb4d 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,110 +1,111 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True -# active multispeaker mode -config.model_args.use_speaker_embedding = True -config.use_speaker_embedding = True - -# deactivate multispeaker d-vec mode -config.model_args.use_d_vector_file = False -config.use_d_vector_file = False - -# duration predictor -config.model_args.use_sdp = False -config.use_sdp = False - -# active language sampler -config.use_language_weighted_sampler = True - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API 
-continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -languae_id = "en" -continue_speakers_path = os.path.join(continue_path, "speakers.json") -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + # active multispeaker mode + config.model_args.use_speaker_embedding = True + config.use_speaker_embedding = True + + # deactivate multispeaker d-vec mode + config.model_args.use_d_vector_file = False + config.use_d_vector_file = False + + # duration predictor + config.model_args.use_sdp = False + config.use_sdp = False + + # active language sampler + config.use_language_weighted_sampler = True + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + 
speaker_id = "ljspeech" + languae_id = "en" + continue_speakers_path = continue_path / "speakers.json" + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index 7ae09c0e5c..8b8757422c 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,117 +1,117 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="multilingual_cleaners", - use_phonemes=False, - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True - -# deactivate multispeaker mode -config.model_args.use_speaker_embedding = False -config.use_speaker_embedding = False - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 -config.d_vector_dim = 256 - -# duration predictor -config.model_args.use_sdp = True -config.use_sdp = True - -# activate language and 
speaker samplers -config.use_language_weighted_sampler = True -config.language_weighted_sampler_alpha = 10 -config.use_speaker_weighted_sampler = True -config.speaker_weighted_sampler_alpha = 5 - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -languae_id = "en" -continue_speakers_path = config.d_vector_file -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + + # deactivate multispeaker mode + config.model_args.use_speaker_embedding = False + config.use_speaker_embedding = False + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 
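
Every test converted in this patch follows the same overall shape: write a small model config into pytest's per-test tmp_path, train for one epoch through the CLI, locate the run folder the trainer created, sanity-check the saved config.json, run tts inference against the last checkpoint, resume training with --continue_path, and finally remove tmp_path. The sketch below condenses that shape for reference only; it assumes the repository's run_cli, get_device_id and get_last_checkpoint helpers as they are used in the patch, uses VitsConfig as a stand-in for the model-specific config, and omits the --coqpit.datasets.0.* overrides the real tests pass, so it will not train as-is.

import json
import shutil

from trainer.io import get_last_checkpoint

from tests import get_device_id, run_cli
from TTS.tts.configs.vits_config import VitsConfig


def test_train(tmp_path):
    # everything the test writes lives under the per-test temporary directory
    config_path = tmp_path / "test_model_config.json"
    output_path = tmp_path / "train_outputs"

    config = VitsConfig(batch_size=2, eval_batch_size=2, epochs=1, print_step=1, run_eval=True)
    config.save_json(config_path)

    # train for one epoch via the CLI (dataset overrides omitted in this sketch)
    run_cli(
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py "
        f"--config_path {config_path} --coqpit.output_path {output_path} "
        "--coqpit.test_delay_epochs 0"
    )

    # the trainer creates a timestamped run folder under output_path; take the newest one
    continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime)

    # sanity-check the config the trainer wrote back out
    with (continue_path / "config.json").open() as f:
        config_loaded = json.load(f)
    assert config_loaded["output_path"] in str(continue_path)

    # inference against the last checkpoint, then resume training from the same folder
    checkpoint, _ = get_last_checkpoint(continue_path)
    run_cli(
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' "
        f"--config_path {continue_path / 'config.json'} --model_path {checkpoint} "
        f"--out_path {tmp_path / 'output.wav'}"
    )
    run_cli(
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py "
        f"--continue_path {continue_path}"
    )
    shutil.rmtree(tmp_path)
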
+ config.d_vector_dim = 256 + + # duration predictor + config.model_args.use_sdp = True + config.use_sdp = True + + # activate language and speaker samplers + config.use_language_weighted_sampler = True + config.language_weighted_sampler_alpha = 10 + config.use_speaker_weighted_sampler = True + config.speaker_weighted_sampler_alpha = 5 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + languae_id = "en" + continue_speakers_path = config.d_vector_file + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 69fae21f8d..6678cca90c 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,83 +1,83 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-1"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / 
"phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-1"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) -config.save_json(config_path) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 78f42d154b..e0f7a656b0 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + 
"--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py index 91c3c35bc6..1582f51fd4 100644 --- a/tests/tts_tests2/test_align_tts_train.py +++ b/tests/tts_tests2/test_align_tts_train.py @@ -1,72 +1,71 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.align_tts_config import AlignTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = AlignTTSConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) + config = AlignTTSConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, 
"config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py index 1e5cd49f73..74d7a0a734 100644 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py @@ -1,100 +1,98 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs( - use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 -) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", 
"ljspeech-0"], - ], - output_path=output_path, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - speaker_embedding_channels=256, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = False -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs( + use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 + ) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + output_path=output_path, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + speaker_embedding_channels=256, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = False + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py index 9bbf7a55ea..68f790599e 100644 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py @@ -1,94 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs(use_speaker_embedding=False) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech"], - ], - output_path=output_path, - num_speakers=4, - use_speaker_embedding=True, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.dataset_name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load 
the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs(use_speaker_embedding=False) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], + output_path=output_path, + num_speakers=4, + use_speaker_embedding=True, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.dataset_name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py index 3e6fbd2e86..4676ee4869 100644 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ b/tests/tts_tests2/test_delightful_tts_train.py @@ -1,97 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs() + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -vocoder_config = VocoderConfig() + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs() + vocoder_config = VocoderConfig() -config = DelightfulTTSConfig( - audio=audio_config, - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - run_eval=True, - test_delay_epochs=-1, - binary_align_loss_alpha=0.0, - epochs=1, - print_step=1, - use_attn_priors=False, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], - use_speaker_embedding=False, -) -config.save_json(config_path) + config = DelightfulTTSConfig( + audio=audio_config, + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + run_eval=True, + test_delay_epochs=-1, + binary_align_loss_alpha=0.0, + epochs=1, + print_step=1, + use_attn_priors=False, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + use_speaker_embedding=False, + ) + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - 
"--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs -1" -) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs -1" + ) -run_cli(command_train) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == -1 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == -1 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py index e6bc9f9feb..379e2f346b 100644 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py @@ -1,92 +1,94 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + 
f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py index fe87c8b600..e0838a2049 100644 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ b/tests/tts_tests2/test_fast_pitch_train.py @@ -1,91 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + 
config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py index 735d2fc4c6..348729c6f4 100644 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py @@ -1,95 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=tmp_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + 
epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py index 07fc5a1a2c..ab513ec827 100644 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ b/tests/tts_tests2/test_fastspeech_2_train.py @@ -1,94 +1,96 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=output_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + 
print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py index 8236607c25..f03139ac77 100644 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_glow_tts_d-vectors_train.py @@ -1,79 +1,80 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + 
"--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py index 4a8bd0658d..b9fe93a2fa 100644 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py @@ -1,76 +1,77 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=True, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=True, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path 
tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py index 1d7f913575..3f1bf3a794 100644 --- a/tests/tts_tests2/test_glow_tts_train.py +++ b/tests/tts_tests2/test_glow_tts_train.py @@ -1,73 +1,74 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + 
"--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 9d4e193382..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = 
max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index c506fb48dc..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import HifiganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 6ef9cd495b..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 
8002760706..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index a126befe2e..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_training.py b/tests/vocoder_tests/test_training.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/vocoder_tests/test_training.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from 
TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), 
"data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def 
test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 43b5f08042..7530bec426 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -1,5 +1,3 @@ -import unittest - import numpy as np import torch from torch import optim @@ -10,50 +8,45 @@ # pylint: disable=unused-variable torch.manual_seed(1) -use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class WavegradTrainTest(unittest.TestCase): - def test_train_step(self): # pylint: disable=no-self-use - """Test if all layers are updated in a basic training cycle""" - input_dummy = torch.rand(8, 1, 20 * 300).to(device) - mel_spec = torch.rand(8, 80, 20).to(device) - - criterion = torch.nn.L1Loss().to(device) - args = WavegradArgs( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 
4, 8]], +def test_train_step(): + """Test if all layers are updated in a basic training cycle""" + torch.set_grad_enabled(True) + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + args = WavegradArgs( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) + + model_ref = Wavegrad(config) + model.train() + model.to(device) + betas = np.linspace(1e-6, 1e-2, 1000) + model.compute_noise_level(betas) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + optimizer = optim.Adam(model.parameters(), lr=0.001) + for _ in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + i, param.shape, param, param_ref ) - config = WavegradConfig(model_params=args) - model = Wavegrad(config) - - model_ref = Wavegrad(config) - model.train() - model.to(device) - betas = np.linspace(1e-6, 1e-2, 1000) - model.compute_noise_level(betas) - model_ref.load_state_dict(model.state_dict()) - model_ref.to(device) - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) - for i in range(5): - y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) - optimizer.zero_grad() - loss = criterion(y_hat, input_dummy) - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 9b10759505..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import os -import shutil -import unittest - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig - - -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 337e24259f..0000000000 --- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,45 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) 
-shutil.rmtree(continue_path) diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py index bb592f1f2d..4d22b8102f 100644 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ b/tests/xtts_tests/test_xtts_gpt_train.py @@ -1,10 +1,9 @@ -import os -import shutil +from pathlib import Path +import pytest import torch from trainer import Trainer, TrainerArgs -from tests import get_tests_output_path from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.layers.xtts.dvae import DiscreteVAE @@ -28,37 +27,9 @@ DASHBOARD_LOGGER = "tensorboard" LOGGER_URI = None -# Set here the path that the checkpoints will be saved. Default: ./run/training/ -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -MEL_NORM_FILE = os.path.join( - OUT_PATH, "mel_stats.pth" -) # Mel spectrogram norms, required for dvae mel spectrogram extraction -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - +XTTS_CHECKPOINT = None # model.pth file # Training sentences generations SPEAKER_REFERENCE = [ @@ -66,99 +37,122 @@ ] # speaker reference to be used in training test sentences LANGUAGE = config_dataset.language - # Training Parameters OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False START_WITH_EVAL = False # if True it will star with evaluation BATCH_SIZE = 2 # set here the batch size GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. 
- - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, -) -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description=""" - GPT XTTS training - """, - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) +# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 +# for more efficient training. You can increase/decrease BATCH_SIZE but then set +# GRAD_ACUMM_STEPS accordingly. 
-# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() -# remove output path -shutil.rmtree(OUT_PATH) +@pytest.mark.parametrize("use_perceiver", [False, True]) +def test_xtts_gpt_train(tmp_path: Path, use_perceiver: bool): + # Create DVAE checkpoint and mel_norms on test time + # DVAE parameters: For the training we need the dvae to extract the dvae tokens, + # given that you must provide the paths for this model + DVAE_CHECKPOINT = tmp_path / "dvae.pth" + # Mel spectrogram norms for dvae mel spectrogram extraction + MEL_NORM_FILE = tmp_path / "mel_stats.pth" + dvae = DiscreteVAE( + channels=80, + normalization=None, + positional_dims=1, + num_tokens=8192, + codebook_dim=512, + hidden_dim=512, + num_resnet_blocks=3, + kernel_size=3, + num_layers=2, + use_transposed_convs=False, + ) + torch.save(dvae.state_dict(), DVAE_CHECKPOINT) + mel_stats = torch.ones(80) + torch.save(mel_stats, MEL_NORM_FILE) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=255995, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=8194, + gpt_start_audio_token=8192, + gpt_stop_audio_token=8193, + gpt_use_perceiver_resampler=use_perceiver, + ) + + config = GPTTrainerConfig( + epochs=1, + output_path=tmp_path, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description="GPT XTTS training", + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=1000, + save_step=10000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. + optimizer="AdamW", + optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, + optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, + lr=5e-06, # learning rate + lr_scheduler="MultiStepLR", + # it was adjusted accordly for the new step scheme + lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, + test_sentences=[ + { + "text": "This cake is great. 
It's so delicious and moist.", + "speaker_wav": SPEAKER_REFERENCE, + "language": LANGUAGE, + }, + ], + ) + + # init the model from config + model = GPTTrainer.init_from_config(config) + + # load training samples + train_samples, eval_samples = load_tts_samples( + DATASETS_CONFIG_LIST, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the trainer and 🚀 + trainer = Trainer( + TrainerArgs( + restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter + skip_train_epoch=False, + start_with_eval=True, + grad_accum_steps=GRAD_ACUMM_STEPS, + ), + config, + output_path=tmp_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + ) + trainer.fit() diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py deleted file mode 100644 index 454e867385..0000000000 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ /dev/null @@ -1,164 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig -from TTS.tts.models.xtts import XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" -PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -# Mel spectrogram norms, required for dvae mel spectrogram extraction -MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth") -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. 
-TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, - gpt_use_masking_gt_prompt_approach=True, - gpt_use_perceiver_resampler=True, -) - -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) - -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description="GPT XTTS training", - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. 
It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/zoo_tests/test_big_models.py b/tests/zoo_tests/test_big_models.py new file mode 100644 index 0000000000..8a9780b4f0 --- /dev/null +++ b/tests/zoo_tests/test_big_models.py @@ -0,0 +1,193 @@ +"""These tests should be run locally because the models are too big for CI.""" + +import os + +import pytest +import torch + +from tests import get_tests_data_path, run_main +from TTS.bin.synthesize import main +from TTS.utils.manage import ModelManager + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + os.environ["COQUI_TOS_AGREED"] = "1" + + +@pytest.fixture +def manager(): + """Set up model manager.""" + return ModelManager(progress_bar=False) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts(tmp_path): + """XTTS is too big to run on github actions. We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v1.1", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") + speaker_wav.append(speaker_wav_2) + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v1.1") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too 
big for CI") +def test_xtts_v2(tmp_path): + """XTTS is too big to run on github actions. We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v2", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_v2_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v2") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + normal_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=1.5, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + fast_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=0.66, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + slow_len = sum([len(chunk) for chunk in wav_chunks]) + + assert slow_len > normal_len + assert normal_len > fast_len + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_tortoise(tmp_path): + args = [ + "--model_name", + "tts_models/en/multi-dataset/tortoise-v2", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_bark(tmp_path): + """Bark is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/bark", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 461b4fbe12..2a8ff0155e 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -2,10 +2,11 @@ import os import shutil -import torch -from trainer.io import get_user_data_dir +import pytest -from tests import get_tests_data_path, get_tests_output_path, run_cli +from tests import get_tests_data_path, run_main +from TTS.api import TTS +from TTS.bin.synthesize import main from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.manage import ModelManager @@ -18,249 +19,79 @@ ] -def run_models(offset=0, step=1): - """Check if all the models are downloadable and tts models run correctly.""" - print(" > Run synthesizer with all the models.") - output_path = os.path.join(get_tests_output_path(), "output.wav") - manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False) - model_names = [name for name in manager.list_models() if name not in MODELS_WITH_SEP_TESTS] - print("Model names:", model_names) - for model_name in model_names[offset::step]: - print(f"\n > Run - {model_name}") - model_path, _, _ = manager.download_model(model_name) - if "tts_models" in model_name: - local_download_dir = model_path.parent - # download and run the model - speaker_files = list(local_download_dir.glob("speaker*")) - language_files = list(local_download_dir.glob("language*")) - speaker_arg = "" - language_arg = "" - if len(speaker_files) > 0: - # multi-speaker model - if "speaker_ids" in speaker_files[0].stem: - speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) - elif "speakers" in speaker_files[0].stem: - speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) - speakers = list(speaker_manager.name_to_id.keys()) - if len(speakers) > 1: - speaker_arg = f'--speaker_idx "{speakers[0]}"' - if len(language_files) > 0 and "language_ids" in language_files[0].stem: - # multi-lingual model - language_manager = LanguageManager(language_ids_file_path=language_files[0]) - languages = language_manager.language_names - if len(languages) > 1: - language_arg = f'--language_idx "{languages[0]}"' - run_cli( - f'tts --model_name {model_name} --text "This is an example." ' - f'--out_path "{output_path}" {speaker_arg} {language_arg} --no-progress_bar' - ) - # remove downloaded models - shutil.rmtree(local_download_dir) - shutil.rmtree(get_user_data_dir("tts")) - elif "voice_conversion_models" in model_name: - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - run_cli( - f"tts --model_name {model_name} " - f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar' - ) - else: - # only download the model - manager.download_model(model_name) - print(f" | > OK: {model_name}") - - -def test_xtts(): - """XTTS is too big to run on github actions. 
We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) +@pytest.fixture(autouse=True) +def run_around_tests(tmp_path): + """Download models to a temp folder and delete it afterwards.""" + os.environ["TTS_HOME"] = str(tmp_path) + yield + shutil.rmtree(tmp_path) + + +@pytest.fixture +def manager(tmp_path): + """Set up model manager.""" + return ModelManager(output_prefix=tmp_path, progress_bar=False) + + +# To split tests into different CI jobs +num_partitions = int(os.getenv("NUM_PARTITIONS", "1")) +partition = int(os.getenv("TEST_PARTITION", "0")) +model_names = [name for name in TTS.list_models() if name not in MODELS_WITH_SEP_TESTS] +model_names = [name for i, name in enumerate(model_names) if i % num_partitions == partition] + + +@pytest.mark.parametrize("model_name", model_names) +def test_models(tmp_path, model_name, manager): + print(f"\n > Run - {model_name}") + output_path = str(tmp_path / "output.wav") + model_path, _, _ = manager.download_model(model_name) + args = ["--model_name", model_name, "--out_path", output_path, "--no-progress_bar"] + if "tts_models" in model_name: + local_download_dir = model_path.parent + # download and run the model + speaker_files = list(local_download_dir.glob("speaker*")) + language_files = list(local_download_dir.glob("language*")) + speaker_arg = [] + language_arg = [] + if len(speaker_files) > 0: + # multi-speaker model + if "speaker_ids" in speaker_files[0].stem: + speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) + elif "speakers" in speaker_files[0].stem: + speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) + speakers = list(speaker_manager.name_to_id.keys()) + if len(speakers) > 1: + speaker_arg = ["--speaker_idx", speakers[0]] + if len(language_files) > 0 and "language_ids" in language_files[0].stem: + # multi-lingual model + language_manager = LanguageManager(language_ids_file_path=language_files[0]) + languages = language_manager.language_names + if len(languages) > 1: + language_arg = ["--language_idx", languages[0]] + run_main(main, [*args, "--text", "This is an example.", *speaker_arg, *language_arg]) + elif "voice_conversion_models" in model_name: + speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") + reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") + run_main(main, [*args, "--source_wav", speaker_wav, "--target_wav", reference_wav]) else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) - - -def test_xtts_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts - - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - speaker_wav.append(speaker_wav_2) - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + # only download the model + manager.download_model(model_name) + print(f" | > OK: {model_name}") - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - - -def test_xtts_v2(): - """XTTS is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) - else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) - -def test_xtts_v2_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts - - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) - - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - normal_len = sum([len(chunk) for chunk in wav_chuncks]) - - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=1.5, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - fast_len = sum([len(chunk) for chunk in wav_chuncks]) - - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=0.66, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - slow_len = sum([len(chunk) for chunk in wav_chuncks]) - - assert slow_len > normal_len - assert normal_len > fast_len - - -def test_tortoise(): - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) - - -def test_bark(): - """Bark is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar' - ) - - -def test_voice_conversion(): +def test_voice_conversion(tmp_path): print(" > Run voice conversion inference using YourTTS model.") - model_name = "tts_models/multilingual/multi-dataset/your_tts" - language_id = "en" - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli( - f"tts --model_name {model_name}" - f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar" - ) - - -""" -These are used to split tests into different actions on Github. -""" - - -def test_models_offset_0_step_3(): - run_models(offset=0, step=3) - - -def test_models_offset_1_step_3(): - run_models(offset=1, step=3) - - -def test_models_offset_2_step_3(): - run_models(offset=2, step=3) + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/your_tts", + "--out_path", + str(tmp_path / "output.wav"), + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + "--reference_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav"), + "--language_idx", + "en", + "--no-progress_bar", + ] + run_main(main, args)