diff --git a/descriptions.py b/descriptions.py
index b615304..1d366b2 100644
--- a/descriptions.py
+++ b/descriptions.py
@@ -21,4 +21,6 @@ infer_from_prompt_md = """
 Faster than **"Infer from audio"**.
 You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
-"""
\ No newline at end of file
+"""
+
+long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
\ No newline at end of file
"no-accent" else token2lang[langdropdown2token[accent]] + encoded_frames = model.inference( + text_tokens.to(device), + text_tokens_lens.to(device), + audio_prompts, + enroll_x_lens=enroll_x_lens, + top_k=-100, + temperature=1, + prompt_language=lang_pr, + text_language=langs if accent == "no-accent" else lang, + ) + complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1) + samples = audio_tokenizer.decode( + [(complete_tokens, None)] + ) + model.to('cpu') + message = f"Cut into {len(sentences)} sentences" + return message, (24000, samples[0][0].cpu().numpy()) + elif mode == "sliding-window": + complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device) + original_audio_prompts = audio_prompts + original_text_prompts = text_prompts + for text in sentences: + text = text.replace("\n", "").strip(" ") + if text == "": + continue + lang_token = lang2token[language] + lang = token2lang[lang_token] + text = lang_token + text + lang_token + + enroll_x_lens = text_prompts.shape[-1] + logging.info(f"synthesize text: {text}") + phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip()) + text_tokens, text_tokens_lens = text_collater( + [ + phone_tokens + ] + ) + text_tokens = torch.cat([text_prompts, text_tokens], dim=-1) + text_tokens_lens += enroll_x_lens + # accent control + lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]] + encoded_frames = model.inference( + text_tokens.to(device), + text_tokens_lens.to(device), + audio_prompts, + enroll_x_lens=enroll_x_lens, + top_k=-100, + temperature=1, + prompt_language=lang_pr, + text_language=langs if accent == "no-accent" else lang, + ) + complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1) + if torch.rand(1) < 1.0: + audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:] + text_prompts = text_tokens[:, enroll_x_lens:] + else: + audio_prompts = original_audio_prompts + text_prompts = original_text_prompts + samples = audio_tokenizer.decode( + [(complete_tokens, None)] + ) + model.to('cpu') + message = f"Cut into {len(sentences)} sentences" + return message, (24000, samples[0][0].cpu().numpy()) + else: + raise ValueError(f"No such mode {mode}") + + def main(): app = gr.Blocks() with app: @@ -394,6 +520,25 @@ def main(): btn_3.click(infer_from_prompt, inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, prompt_file], outputs=[text_output_3, audio_output_3]) + with gr.Tab("Infer long text"): + gr.Markdown("This is a long text generation demo. You can use this to generate long audio. 
") + with gr.Row(): + with gr.Column(): + textbox_4 = gr.TextArea(label="Text", + placeholder="Type your sentence here", + value=long_text_example, elem_id=f"tts-input") + language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect', + label='language') + accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', + label='accent') + prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True) + with gr.Column(): + text_output_4 = gr.TextArea(label="Message") + audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio") + btn_4 = gr.Button("Generate!") + btn_4.click(infer_long_text, + inputs=[textbox_4, prompt_file_4, language_dropdown_4, accent_dropdown_4], + outputs=[text_output_4, audio_output_4]) app.launch() diff --git a/utils/generation.py b/utils/generation.py index 05e932a..30ed316 100644 --- a/utils/generation.py +++ b/utils/generation.py @@ -22,6 +22,7 @@ from data.collation import get_text_token_collater from models.vallex import VALLE from utils.g2p import PhonemeBpeTokenizer +from utils.sentence_cutter import split_text_into_sentences from macros import * @@ -130,4 +131,126 @@ def generate_audio(text, prompt=None, language='auto', accent='no-accent'): [(encoded_frames.transpose(2, 1), None)] ) - return samples[0][0].cpu().numpy() \ No newline at end of file + return samples[0][0].cpu().numpy() + +@torch.no_grad() +def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no-accent', mode='sliding-window'): + """ + For long audio generation, two modes are available. + fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence. + sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance. 
+ """ + global model, codec, text_tokenizer, text_collater + if prompt is None or prompt == "": + mode = 'sliding-window' # If no prompt is given, use sliding-window mode + sentences = split_text_into_sentences(text) + # detect language + if language == "auto": + language = langid.classify(text)[0] + + # if initial prompt is given, encode it + if prompt is not None and prompt != "": + prompt_path = prompt + if not os.path.exists(prompt_path): + prompt_path = "./presets/" + prompt + ".npz" + if not os.path.exists(prompt_path): + prompt_path = "./customs/" + prompt + ".npz" + if not os.path.exists(prompt_path): + raise ValueError(f"Cannot find prompt {prompt}") + prompt_data = np.load(prompt_path) + audio_prompts = prompt_data['audio_tokens'] + text_prompts = prompt_data['text_tokens'] + lang_pr = prompt_data['lang_code'] + lang_pr = code2lang[int(lang_pr)] + + # numpy to tensor + audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device) + text_prompts = torch.tensor(text_prompts).type(torch.int32) + else: + audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device) + text_prompts = torch.zeros([1, 0]).type(torch.int32) + lang_pr = language if language != 'mix' else 'en' + if mode == 'fixed-prompt': + complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device) + for text in sentences: + text = text.replace("\n", "").strip(" ") + if text == "": + continue + lang_token = lang2token[language] + lang = token2lang[lang_token] + text = lang_token + text + lang_token + + enroll_x_lens = text_prompts.shape[-1] + logging.info(f"synthesize text: {text}") + phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip()) + text_tokens, text_tokens_lens = text_collater( + [ + phone_tokens + ] + ) + text_tokens = torch.cat([text_prompts, text_tokens], dim=-1) + text_tokens_lens += enroll_x_lens + # accent control + lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]] + encoded_frames = model.inference( + text_tokens.to(device), + text_tokens_lens.to(device), + audio_prompts, + enroll_x_lens=enroll_x_lens, + top_k=-100, + temperature=1, + prompt_language=lang_pr, + text_language=langs if accent == "no-accent" else lang, + ) + complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1) + samples = codec.decode( + [(complete_tokens, None)] + ) + return samples[0][0].cpu().numpy() + elif mode == "sliding-window": + complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device) + original_audio_prompts = audio_prompts + original_text_prompts = text_prompts + for text in sentences: + text = text.replace("\n", "").strip(" ") + if text == "": + continue + lang_token = lang2token[language] + lang = token2lang[lang_token] + text = lang_token + text + lang_token + + enroll_x_lens = text_prompts.shape[-1] + logging.info(f"synthesize text: {text}") + phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip()) + text_tokens, text_tokens_lens = text_collater( + [ + phone_tokens + ] + ) + text_tokens = torch.cat([text_prompts, text_tokens], dim=-1) + text_tokens_lens += enroll_x_lens + # accent control + lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]] + encoded_frames = model.inference( + text_tokens.to(device), + text_tokens_lens.to(device), + audio_prompts, + enroll_x_lens=enroll_x_lens, + top_k=-100, + temperature=1, + prompt_language=lang_pr, + text_language=langs if accent == "no-accent" else lang, + ) + complete_tokens = 
diff --git a/utils/sentence_cutter.py b/utils/sentence_cutter.py
new file mode 100644
index 0000000..15ec197
--- /dev/null
+++ b/utils/sentence_cutter.py
@@ -0,0 +1,54 @@
+import nltk
+import jieba
+import sudachipy
+import langid
+langid.set_languages(['en', 'zh', 'ja'])
+
+def split_text_into_sentences(text):
+    if langid.classify(text)[0] == "en":
+        sentences = nltk.tokenize.sent_tokenize(text)
+
+        return sentences
+    elif langid.classify(text)[0] == "zh":
+        sentences = []
+        segs = jieba.cut(text, cut_all=False)
+        segs = list(segs)
+        start = 0
+        for i, seg in enumerate(segs):
+            if seg in ["。", "!", "?", "……"]:
+                sentences.append("".join(segs[start:i + 1]))
+                start = i + 1
+        if start < len(segs):
+            sentences.append("".join(segs[start:]))
+
+        return sentences
+    elif langid.classify(text)[0] == "ja":
+        sentences = []
+        tokenizer = sudachipy.Dictionary().create()
+        tokens = tokenizer.tokenize(text)
+        current_sentence = ""
+
+        for token in tokens:
+            current_sentence += token.surface()
+            if token.part_of_speech()[0] == "補助記号" and token.part_of_speech()[1] == "句点":
+                sentences.append(current_sentence)
+                current_sentence = ""
+
+        if current_sentence:
+            sentences.append(current_sentence)
+
+        return sentences
+
+    raise RuntimeError("Unreachable: langid is restricted to ['en', 'zh', 'ja'].")
+
+long_text = """
+This is a very long paragraph, so most TTS models are unable to handle it. Hence, we have to split it into several sentences. With the help of NLTK, we can split it into sentences. However, the punctuation is not preserved, so we have to add it back. How are we going to write this code? Let's see.
+"""
+
+long_text = """
+现在我们要来尝试一下中文分句。因为很不幸的是,NLTK不支持中文分句。幸运的是,我们可以使用jieba来分句。但是,jieba分句后,标点符号会丢失,所以我们要手动添加回去。我现在正在想办法把这个例句写的更长更复杂一点,来测试jieba分句的性能。嗯......省略号,感觉不太好,因为省略号不是句号,所以jieba不会把它当作句子的结尾。会这样吗?我们来试试看。
+"""
+
+long_text = """
+これなら、英語と中国語の分句もできる。でも、日本語はどうする?まつわ、ChatGPTに僕と教えてください。ちょーと待ってください。あ、出来た!
+"""
\ No newline at end of file
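A quick way to exercise the new splitter from a Python shell. The nltk.download call is an assumption about the environment (sent_tokenize needs the "punkt" data), and the sample strings are arbitrary:

import nltk
nltk.download('punkt')  # one-time download of the English sentence model, if not already present

from utils.sentence_cutter import split_text_into_sentences

print(split_text_into_sentences("This is the first sentence. Here is the second one."))
# ['This is the first sentence.', 'Here is the second one.']
print(split_text_into_sentences("我们先写第一句。然后再写第二句!"))
# expected: two entries, split on the fullwidth 。 and ! via jieba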