diff --git a/README.md b/README.md index 720585dbc2..594777c116 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea You can also help us implement more models. ## Installation -🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**. +🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12**. If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. @@ -198,17 +198,18 @@ from TTS.api import TTS # Get device device = "cuda" if torch.cuda.is_available() else "cpu" -# List available 🐸TTS models and choose the first one -model_name = TTS().list_models()[0] +# List available 🐸TTS models +print(TTS().list_models()) + # Init TTS -tts = TTS(model_name).to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) # Run TTS -# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language -# Text to speech with a numpy output -wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) +# ❗ Since this model is a multi-lingual voice cloning model, we must set the target speaker_wav and language +# Text to speech with a list of amplitude values as output +wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") # Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") +tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") ``` #### Running a single speaker model @@ -347,6 +348,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
$ tts --text "Text for TTS" --out_path output/path/speech.wav ``` +- Run TTS and pipe out the generated TTS wav file data: + + ``` + $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ``` + +- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: + + ``` + $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav + ``` + - Run a TTS model with its default vocoder model: ``` diff --git a/TTS/.models.json b/TTS/.models.json index ba7b5f6289..a3f73169e0 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -5,9 +5,9 @@ "xtts_v1": { "description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.", "hf_url": [ - "https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/model.pth", - "https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/config.json", - "https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/vocab.json" + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json" ], "default_vocoder": null, "commit": "e5140314", @@ -917,4 +917,4 @@ } } } -} \ No newline at end of file +} diff --git a/TTS/VERSION b/TTS/VERSION index 8bb22944c0..a1338f8a42 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.17.8 +0.17.9 diff --git a/TTS/api.py b/TTS/api.py index e1d167a997..dd5820f8a4 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -112,7 +112,6 @@ def is_multi_lingual(self): return self.synthesizer.tts_model.language_manager.num_languages > 1 return False - @property def speakers(self): if not self.is_multi_speaker: @@ -265,6 +264,7 @@ def tts_coqui_studio( language: str = None, emotion: str = None, speed: float = 1.0, + pipe_out = None, file_path: str = None, ) -> Union[np.ndarray, str]: """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. @@ -281,6 +281,8 @@ with "V1" model. Defaults to None. speed (float, optional): Speed of the speech. Defaults to 1.0. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. file_path (str, optional): Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. @@ -294,6 +296,7 @@ speaker_name=speaker_name, language=language, speed=speed, + pipe_out=pipe_out, emotion=emotion, file_path=file_path, )[0] @@ -356,6 +359,7 @@ def tts_to_file( speaker_wav: str = None, emotion: str = None, speed: float = 1.0, + pipe_out = None, file_path: str = "output.wav", **kwargs, ): @@ -377,6 +381,8 @@ Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral". speed (float, optional): Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. file_path (str, optional): Output file path. Defaults to "output.wav".
kwargs (dict, optional): @@ -386,10 +392,16 @@ def tts_to_file( if self.csapi is not None: return self.tts_coqui_studio( - text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path + text=text, + speaker_name=speaker, + language=language, + emotion=emotion, + speed=speed, + file_path=file_path, + pipe_out=pipe_out, ) wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs) - self.synthesizer.save_wav(wav=wav, path=file_path) + self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) return file_path def voice_conversion( diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 5ff1181f4e..78a20c2566 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import argparse +import contextlib import sys from argparse import RawTextHelpFormatter @@ -59,6 +60,18 @@ $ tts --text "Text for TTS" --out_path output/path/speech.wav ``` +- Run TTS and pipe out the generated TTS wav file data: + + ``` + $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ``` + +- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: + + ``` + $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav + ``` + - Run a TTS model with its default vocoder model: ``` @@ -228,6 +241,20 @@ def main(): help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.", default=None, ) + parser.add_argument( + "--pipe_out", + help="stdout the generated TTS wav file for shell pipe.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) + parser.add_argument( + "--speed", + type=float, + help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.", + default=None, + ) # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) @@ -335,167 +362,177 @@ def main(): if not any(check_args): parser.parse_args(["-h"]) - # Late-import to make things load faster - from TTS.api import TTS - from TTS.utils.manage import ModelManager - from TTS.utils.synthesizer import Synthesizer - - # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path, progress_bar=args.progress_bar) - api = TTS() - - tts_path = None - tts_config_path = None - speakers_file_path = None - language_ids_file_path = None - vocoder_path = None - vocoder_config_path = None - encoder_path = None - encoder_config_path = None - vc_path = None - vc_config_path = None - model_dir = None - - # CASE1 #list : list pre-trained TTS models - if args.list_models: - manager.add_cs_api_models(api.list_models()) - manager.list_models() - sys.exit() - - # CASE2 #info : model info for pre-trained TTS models - if args.model_info_by_idx: - model_query = args.model_info_by_idx - manager.model_info_by_idx(model_query) - sys.exit() - - if args.model_info_by_name: - model_query_full_name = args.model_info_by_name - manager.model_info_by_full_name(model_query_full_name) - sys.exit() - - # CASE3: TTS with coqui studio models - if "coqui_studio" in args.model_name: - print(" > Using 🐸Coqui Studio model: ", args.model_name) - api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) - api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language) - print(" > Saving output to ", args.out_path) - return - - # 
CASE4: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - # tts model - if model_item["model_type"] == "tts_models": - tts_path = model_path - tts_config_path = config_path - if "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - - # voice conversion model - if model_item["model_type"] == "voice_conversion_models": - vc_path = model_path - vc_config_path = config_path - - # tts model with multiple files to be loaded from the directory path - if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): - model_dir = model_path - tts_path = None - tts_config_path = None - args.vocoder_name = None - - # load vocoder - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - - # CASE5: set custom model paths - if args.model_path is not None: - tts_path = args.model_path - tts_config_path = args.config_path - speakers_file_path = args.speakers_file_path - language_ids_file_path = args.language_ids_file_path - - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path - - device = args.device - if args.use_cuda: - device = "cuda" - - # load models - synthesizer = Synthesizer( - tts_path, - tts_config_path, - speakers_file_path, - language_ids_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - vc_path, - vc_config_path, - model_dir, - args.voice_dir, - ).to(device) - - # query speaker ids of a multi-speaker model. - if args.list_speaker_idxs: - print( - " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." - ) - print(synthesizer.tts_model.speaker_manager.name_to_id) - return - - # query langauge ids of a multi-lingual model. - if args.list_language_idxs: - print( - " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." - ) - print(synthesizer.tts_model.language_manager.name_to_id) - return - - # check the arguments against a multi-speaker model. - if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): - print( - " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " - "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." 
- ) - return - - # RUN THE SYNTHESIS - if args.text: - print(" > Text: {}".format(args.text)) - - # kick it - if tts_path is not None: - wav = synthesizer.tts( - args.text, - speaker_name=args.speaker_idx, - language_name=args.language_idx, - speaker_wav=args.speaker_wav, - reference_wav=args.reference_wav, - style_wav=args.capacitron_style_wav, - style_text=args.capacitron_style_text, - reference_speaker_name=args.reference_speaker_idx, - ) - elif vc_path is not None: - wav = synthesizer.voice_conversion( - source_wav=args.source_wav, - target_wav=args.target_wav, - ) - elif model_dir is not None: - wav = synthesizer.tts( - args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav - ) - - # save the results - print(" > Saving output to {}".format(args.out_path)) - synthesizer.save_wav(wav, args.out_path) + pipe_out = sys.stdout if args.pipe_out else None + + with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): + # Late-import to make things load faster + from TTS.api import TTS + from TTS.utils.manage import ModelManager + from TTS.utils.synthesizer import Synthesizer + + # load model manager + path = Path(__file__).parent / "../.models.json" + manager = ModelManager(path, progress_bar=args.progress_bar) + api = TTS() + + tts_path = None + tts_config_path = None + speakers_file_path = None + language_ids_file_path = None + vocoder_path = None + vocoder_config_path = None + encoder_path = None + encoder_config_path = None + vc_path = None + vc_config_path = None + model_dir = None + + # CASE1 #list : list pre-trained TTS models + if args.list_models: + manager.add_cs_api_models(api.list_models()) + manager.list_models() + sys.exit() + + # CASE2 #info : model info for pre-trained TTS models + if args.model_info_by_idx: + model_query = args.model_info_by_idx + manager.model_info_by_idx(model_query) + sys.exit() + + if args.model_info_by_name: + model_query_full_name = args.model_info_by_name + manager.model_info_by_full_name(model_query_full_name) + sys.exit() + + # CASE3: TTS with coqui studio models + if "coqui_studio" in args.model_name: + print(" > Using 🐸Coqui Studio model: ", args.model_name) + api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) + api.tts_to_file( + text=args.text, + emotion=args.emotion, + file_path=args.out_path, + language=args.language, + speed=args.speed, + pipe_out=pipe_out, + ) + print(" > Saving output to ", args.out_path) + return + + # CASE4: load pre-trained model paths + if args.model_name is not None and not args.model_path: + model_path, config_path, model_item = manager.download_model(args.model_name) + # tts model + if model_item["model_type"] == "tts_models": + tts_path = model_path + tts_config_path = config_path + if "default_vocoder" in model_item: + args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + + # voice conversion model + if model_item["model_type"] == "voice_conversion_models": + vc_path = model_path + vc_config_path = config_path + + # tts model with multiple files to be loaded from the directory path + if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): + model_dir = model_path + tts_path = None + tts_config_path = None + args.vocoder_name = None + + # load vocoder + if args.vocoder_name is not None and not args.vocoder_path: + vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) + + # CASE5: set custom model paths + if args.model_path is not 
None: + tts_path = args.model_path + tts_config_path = args.config_path + speakers_file_path = args.speakers_file_path + language_ids_file_path = args.language_ids_file_path + + if args.vocoder_path is not None: + vocoder_path = args.vocoder_path + vocoder_config_path = args.vocoder_config_path + + if args.encoder_path is not None: + encoder_path = args.encoder_path + encoder_config_path = args.encoder_config_path + + device = args.device + if args.use_cuda: + device = "cuda" + + # load models + synthesizer = Synthesizer( + tts_path, + tts_config_path, + speakers_file_path, + language_ids_file_path, + vocoder_path, + vocoder_config_path, + encoder_path, + encoder_config_path, + vc_path, + vc_config_path, + model_dir, + args.voice_dir, + ).to(device) + + # query speaker ids of a multi-speaker model. + if args.list_speaker_idxs: + print( + " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." + ) + print(synthesizer.tts_model.speaker_manager.name_to_id) + return + + # query langauge ids of a multi-lingual model. + if args.list_language_idxs: + print( + " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + ) + print(synthesizer.tts_model.language_manager.name_to_id) + return + + # check the arguments against a multi-speaker model. + if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + print( + " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " + "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." + ) + return + + # RUN THE SYNTHESIS + if args.text: + print(" > Text: {}".format(args.text)) + + # kick it + if tts_path is not None: + wav = synthesizer.tts( + args.text, + speaker_name=args.speaker_idx, + language_name=args.language_idx, + speaker_wav=args.speaker_wav, + reference_wav=args.reference_wav, + style_wav=args.capacitron_style_wav, + style_text=args.capacitron_style_text, + reference_speaker_name=args.reference_speaker_idx, + ) + elif vc_path is not None: + wav = synthesizer.voice_conversion( + source_wav=args.source_wav, + target_wav=args.target_wav, + ) + elif model_dir is not None: + wav = synthesizer.tts( + args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav + ) + + # save the results + print(" > Saving output to {}".format(args.out_path)) + synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) if __name__ == "__main__": diff --git a/TTS/cs_api.py b/TTS/cs_api.py index a36452abc9..4a44b535fd 100644 --- a/TTS/cs_api.py +++ b/TTS/cs_api.py @@ -9,6 +9,8 @@ import requests from scipy.io import wavfile +from TTS.utils.audio.numpy_transforms import save_wav + class Speaker(object): """Convert dict to object.""" @@ -288,6 +290,7 @@ def tts_to_file( speaker_id=None, emotion=None, speed=1.0, + pipe_out=None, language=None, file_path: str = None, ) -> str: @@ -300,6 +303,7 @@ def tts_to_file( speaker_id (str): Speaker ID. If None, the speaker name is used. emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". speed (float): Speed of the speech. 1.0 is normal speed. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. language (str): Language of the text. If None, the default language of the speaker is used. Language is only supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". 
file_path (str): Path to save the file. If None, a temporary file is created. @@ -307,7 +311,7 @@ def tts_to_file( if file_path is None: file_path = tempfile.mktemp(".wav") wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language) - wavfile.write(file_path, sr, wav) + save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out) return file_path diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 6d1e90ca5f..9e1b1c4097 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -396,6 +396,7 @@ def _forward_encoder( - g: :math:`(B, C)` """ if hasattr(self, "emb_g"): + g = g.type(torch.LongTensor) g = self.emb_g(g) # [B, C, 1] if g is not None: g = g.unsqueeze(-1) @@ -683,9 +684,10 @@ def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # p # encoder pass o_en, x_mask, g, _ = self._forward_encoder(x, x_mask, g) # duration predictor pass - o_dr_log = self.duration_predictor(o_en, x_mask) + o_dr_log = self.duration_predictor(o_en.squeeze(), x_mask) o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) + # pitch predictor pass o_pitch = None if self.args.use_pitch: diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index ae44472f05..e2b71fb2fe 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,3 +1,4 @@ +from io import BytesIO from typing import Tuple import librosa @@ -427,16 +428,24 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, return x -def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, **kwargs) -> None: +def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out = None, **kwargs) -> None: """Save float waveform to a file using Scipy. Args: wav (np.ndarray): Waveform with float values in range [-1, 1] to save. path (str): Path to a output file. sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. """ wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16)) + + wav_norm = wav_norm.astype(np.int16) + if pipe_out: + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm) + wav_buffer.seek(0) + pipe_out.buffer.write(wav_buffer.read()) + scipy.io.wavfile.write(path, sample_rate, wav_norm) def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray: diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index b0920dc9eb..248e15b888 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,3 +1,4 @@ +from io import BytesIO from typing import Dict, Tuple import librosa @@ -693,20 +694,27 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray: x = self.rms_volume_norm(x, self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: + def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out = None) -> None: """Save a waveform to a file using Scipy. Args: wav (np.ndarray): Waveform to save. path (str): Path to a output file. sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. 
""" if self.do_rms_norm: wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767 else: wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) + wav_norm = wav_norm.astype(np.int16) + if pipe_out: + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm) + wav_buffer.seek(0) + pipe_out.buffer.write(wav_buffer.read()) + scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm) def get_duration(self, filename: str) -> float: """Get the duration of a wav file using Librosa. diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 24a078f5f1..a7370cd2c9 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -235,19 +235,20 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str) -> None: + def save_wav(self, wav: List[int], path: str, pipe_out = None) -> None: """Save the waveform as a file. Args: wav (List[int]): waveform as a list of values. path (str): output path to save the waveform. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. """ # if tensor convert to numpy if torch.is_tensor(wav): wav = wav.cpu().numpy() if isinstance(wav, list): wav = np.array(wav) - save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate) + save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: output_wav = self.vc_model.voice_conversion(source_wav, target_wav) @@ -299,11 +300,7 @@ def tts( speaker_embedding = None speaker_id = None if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"): - # handle Neon models with single speaker. - if len(self.tts_model.speaker_manager.name_to_id) == 1: - speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0] - - elif speaker_name and isinstance(speaker_name, str): + if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( @@ -313,7 +310,9 @@ def tts( else: # get speaker idx from the speaker name speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name] - + # handle Neon models with single speaker. + elif len(self.tts_model.speaker_manager.name_to_id) == 1: + speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0] elif not speaker_name and not speaker_wav: raise ValueError( " [!] Looks like you are using a multi-speaker model. " diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md index 477708878f..796c7b6d06 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/formatting_your_dataset.md @@ -17,19 +17,20 @@ Let's assume you created the audio clips and their transcription. You can collec ... ``` -You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimitered by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text. +You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. 
In this file, each column must be delimited by a special character separating the audio file name, the transcription, and the normalized transcription. Make sure that the delimiter is not used in the transcription text. We recommend the following format delimited by `|`. In the following example, `audio1`, `audio2` refer to files `audio1.wav`, `audio2.wav` etc. ``` # metadata.txt -audio1|This is my sentence. -audio2|This is maybe my sentence. -audio3|This is certainly my sentence. -audio4|Let this be your sentence. +audio1|This is my sentence.|This is my sentence. +audio2|1469 and 1470|fourteen sixty-nine and fourteen seventy +audio3|It'll be $16 sir.|It'll be sixteen dollars sir. ... ``` +*If you don't have normalized transcriptions, you can use the same transcription for both columns. In that case, we recommend applying normalization later in the pipeline, either in the text cleaner or in the phonemizer.* + In the end, we have the following folder structure ``` diff --git a/docs/source/implementing_a_new_model.md b/docs/source/implementing_a_new_model.md index 134271ff16..e2a0437e9a 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/implementing_a_new_model.md @@ -41,7 +41,7 @@ 6. Optionally, define `MyModelArgs`. `MyModelArgs` is a 👨‍✈️Coqpit class that sets all the class arguments of the `MyModel`. `MyModelArgs` must have - all the fields neccessary to instantiate the `MyModel`. However, for training, you need to pass `MyModelConfig` to + all the fields necessary to instantiate the `MyModel`. However, for training, you need to pass `MyModelConfig` to the model. 7. Test `MyModel`. diff --git a/docs/source/inference.md b/docs/source/inference.md index 3071be4f4d..4de9ecdd14 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -114,18 +114,24 @@ tts-server --model_name "///" \ You can run a multi-speaker and multi-lingual model in Python as ```python +import torch from TTS.api import TTS -# List available 🐸TTS models and choose the first one -model_name = TTS().list_models()[0] +# Get device +device = "cuda" if torch.cuda.is_available() else "cpu" + +# List available 🐸TTS models +print(TTS().list_models()) + # Init TTS -tts = TTS(model_name) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) + # Run TTS -# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language -# Text to speech with a numpy output -wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) +# ❗ Since this model is a multi-lingual voice cloning model, we must set the target speaker_wav and language +# Text to speech with a list of amplitude values as output +wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") # Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") +tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") ``` #### Here is an example for a single speaker model.
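For readers tracing the new `pipe_out` plumbing: the keyword added to `TTS.tts_to_file` in this patch can also be used from Python. The CLI passes `sys.stdout` and silences the library's regular text logging with `contextlib.redirect_stdout` so that only RIFF/WAV bytes reach the pipe. Below is a minimal sketch of the same pattern, reusing the XTTS model name and the placeholder reference clip from the example above; the script name mentioned afterwards is illustrative.

```python
import contextlib
import sys

import torch
from TTS.api import TTS

# Keep a handle on the real stdout for the wav bytes, then silence the library's
# text logging so the piped stream stays clean (mirrors the CLI's --pipe_out path).
pipe_out = sys.stdout
with contextlib.redirect_stdout(None):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
    # Writes output.wav and additionally streams the same RIFF bytes to stdout.
    tts.tts_to_file(
        text="Hello world!",
        speaker_wav="my/cloning/audio.wav",
        language="en",
        file_path="output.wav",
        pipe_out=pipe_out,
    )
```

Saved as, say, `clone_and_pipe.py`, this should let `python clone_and_pipe.py | aplay` play the clip while `output.wav` is still written to disk.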
diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index f765fff7bd..876e09e5b6 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,3 +1,3 @@ # Trainer API -We made the trainer a seprate project on https://github.com/coqui-ai/Trainer +We made the trainer a separate project on https://github.com/coqui-ai/Trainer diff --git a/docs/source/models/forward_tts.md b/docs/source/models/forward_tts.md index 4739496770..f8f941c2fd 100644 --- a/docs/source/models/forward_tts.md +++ b/docs/source/models/forward_tts.md @@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures: - **FastPitch:** - It uses the same FastSpeech architecture that is conditioned on fundemental frequency (f0) contours with the + It uses the same FastSpeech architecture that is conditioned on fundamental frequency (f0) contours with the promise of more expressive speech. - **SpeedySpeech:** diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index 4f0dbb8e40..f9c493619d 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -100,7 +100,7 @@ " wav_file = item[\"audio_file\"].strip()\n", " wav_files.append(wav_file)\n", " if not os.path.exists(wav_file):\n", - " print(waf_path)" + " print(wav_file)" ] }, { diff --git a/tests/api_tests/test_synthesize_api.py b/tests/api_tests/test_synthesize_api.py index a96c8beab6..084f81d489 100644 --- a/tests/api_tests/test_synthesize_api.py +++ b/tests/api_tests/test_synthesize_api.py @@ -13,3 +13,16 @@ def test_synthesize(): '--text "This is it" ' f'--out_path "{output_path}"' ) + + # 🐸 Coqui studio model with speed arg. + run_cli( + 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' + '--text "This is it but slow" --speed 0.1 ' + f'--out_path "{output_path}"' + ) + + # Test the pipe_out command. + run_cli( + 'tts --text "test." --pipe_out ' + f'--out_path "{output_path}" | aplay' + )
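The new tests above pipe the CLI output through `aplay`, which requires ALSA on the test machine. A hypothetical companion check (not part of this patch; the test name and the pytest `tmp_path` fixture are assumptions) could instead validate the `--pipe_out` byte stream directly: scipy writes a standard RIFF container, so the stream is expected to start with `RIFF` and carry a `WAVE` form type at offset 8.

```python
import subprocess


def test_pipe_out_emits_riff_header(tmp_path):
    # Hypothetical check, not part of this patch: run the CLI with --pipe_out and
    # capture stdout; the patch redirects normal prints away from stdout, so only
    # the wav bytes should remain on the pipe.
    out_path = tmp_path / "speech.wav"
    result = subprocess.run(
        ["tts", "--text", "test.", "--pipe_out", "--out_path", str(out_path)],
        stdout=subprocess.PIPE,
        check=True,
    )
    assert result.stdout[:4] == b"RIFF"
    assert result.stdout[8:12] == b"WAVE"
    assert out_path.exists()
```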