Skip to content

Commit

Permalink
Added long-text generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Plachtaa committed Aug 22, 2023
1 parent 63ff1ba commit 69bf5b4
Show file tree
Hide file tree
Showing 4 changed files with 326 additions and 2 deletions.
4 changes: 3 additions & 1 deletion descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@
infer_from_prompt_md = """
Faster than **"Infer from audio"**.<br>
You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
"""
"""

# Sample English paragraph; used as the pre-filled value of the text box in
# the "Infer long text" tab of the UI.
long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
145 changes: 145 additions & 0 deletions launch-ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,132 @@ def infer_from_prompt(text, language, accent, prompt_file):
return message, (24000, samples[0][0].cpu().numpy())


from utils.sentence_cutter import split_text_into_sentences
@torch.no_grad()
def infer_long_text(text, prompt=None, language='auto', accent='no-accent'):
    """Synthesize a long text sentence by sentence and return one audio clip.

    The text is cut with ``split_text_into_sentences``; every non-empty
    sentence is synthesized on its own and the generated audio tokens are
    concatenated, then decoded once at the end.

    Two prompting modes exist and are selected automatically:

    * fixed-prompt: used when ``prompt`` is given. Every sentence is
      conditioned on the same user-provided prompt.
    * sliding-window: used when no prompt is given. Each sentence is
      conditioned on the previously generated sentence, which has some
      concern on speaker maintenance.

    Args:
        text: The full text to synthesize.
        prompt: Uploaded prompt file object whose ``.name`` is the path of an
            ``.npz`` archive holding ``audio_tokens``, ``text_tokens`` and
            ``lang_code``; ``None`` or ``""`` means no prompt.
        language: ``"auto-detect"`` (or the default ``"auto"``) to detect the
            language with ``langid``; otherwise a key of
            ``langdropdown2token``.
        accent: ``"no-accent"`` to use the language reported by the text
            tokenizer, otherwise a key of ``langdropdown2token`` that
            overrides the text language passed to the model.

    Returns:
        A tuple ``(message, (24000, samples))`` where ``message`` reports how
        many sentences the text was cut into and ``samples`` is a NumPy array
        of decoded audio. 24000 is presumably the sample rate in Hz expected
        by the ``gr.Audio`` output -- TODO confirm.
    """
    global model, audio_tokenizer, text_tokenizer, text_collater
    has_prompt = prompt is not None and prompt != ""
    # Without a prompt there is nothing fixed to condition on, so the previous
    # sentence is used as the prompt instead (sliding-window mode).
    mode = 'fixed-prompt' if has_prompt else 'sliding-window'

    model.to(device)
    try:
        sentences = split_text_into_sentences(text)

        # Detect language. "auto" is this function's own default value, so it
        # is treated like the dropdown's "auto-detect" choice.
        # NOTE(review): assumes "auto" is not a key of langdropdown2token.
        if language in ("auto", "auto-detect"):
            language = langid.classify(text)[0]
        else:
            language = token2lang[langdropdown2token[language]]

        if has_prompt:
            # Read everything needed inside the ``with`` so the archive's file
            # handle is closed as soon as the arrays have been extracted.
            with np.load(prompt.name) as prompt_data:
                audio_prompts = prompt_data['audio_tokens']
                text_prompts = prompt_data['text_tokens']
                lang_pr = code2lang[int(prompt_data['lang_code'])]

            # numpy to tensor
            audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
            text_prompts = torch.tensor(text_prompts).type(torch.int32)
        else:
            # Zero-length prompts: the first sentence is generated unconditioned.
            audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
            text_prompts = torch.zeros([1, 0]).type(torch.int32)
            lang_pr = language if language != 'mix' else 'en'

        # Both lookups are the same for every sentence, so do them once.
        lang_token = lang2token[language]
        if accent == "no-accent":
            accent_lang = None
        else:
            accent_lang = token2lang[langdropdown2token[accent]]

        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
        for sentence in sentences:
            sentence = sentence.replace("\n", "").strip(" ")
            if sentence == "":
                continue
            # The sentence is wrapped in the language token on both sides.
            tagged = lang_token + sentence + lang_token

            enroll_x_lens = text_prompts.shape[-1]
            logging.info(f"synthesize text: {tagged}")
            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{tagged}".strip())
            text_tokens, text_tokens_lens = text_collater([phone_tokens])
            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
            text_tokens_lens += enroll_x_lens

            encoded_frames = model.inference(
                text_tokens.to(device),
                text_tokens_lens.to(device),
                audio_prompts,
                enroll_x_lens=enroll_x_lens,
                top_k=-100,
                temperature=1,
                prompt_language=lang_pr,
                # accent control
                text_language=langs if accent_lang is None else accent_lang,
            )
            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)

            if mode == 'sliding-window':
                # The sentence just generated becomes the prompt for the next.
                # NOTE(review): the slice appears to keep every quantizer
                # channel of encoded_frames -- confirm its layout.
                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
                text_prompts = text_tokens[:, enroll_x_lens:]

        samples = audio_tokenizer.decode(
            [(complete_tokens, None)]
        )
    finally:
        # Move the model back off the device even when synthesis fails.
        model.to('cpu')

    message = f"Cut into {len(sentences)} sentences"
    return message, (24000, samples[0][0].cpu().numpy())


def main():
app = gr.Blocks()
with app:
Expand Down Expand Up @@ -394,6 +520,25 @@ def main():
btn_3.click(infer_from_prompt,
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, prompt_file],
outputs=[text_output_3, audio_output_3])
with gr.Tab("Infer long text"):
gr.Markdown("This is a long text generation demo. You can use this to generate long audio. ")
with gr.Row():
with gr.Column():
textbox_4 = gr.TextArea(label="Text",
placeholder="Type your sentence here",
value=long_text_example, elem_id=f"tts-input")
language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
label='language')
accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
label='accent')
prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
with gr.Column():
text_output_4 = gr.TextArea(label="Message")
audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
btn_4 = gr.Button("Generate!")
btn_4.click(infer_long_text,
inputs=[textbox_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
outputs=[text_output_4, audio_output_4])

app.launch()

Expand Down
125 changes: 124 additions & 1 deletion utils/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from data.collation import get_text_token_collater
from models.vallex import VALLE
from utils.g2p import PhonemeBpeTokenizer
from utils.sentence_cutter import split_text_into_sentences

from macros import *

Expand Down Expand Up @@ -130,4 +131,126 @@ def generate_audio(text, prompt=None, language='auto', accent='no-accent'):
[(encoded_frames.transpose(2, 1), None)]
)

return samples[0][0].cpu().numpy()
return samples[0][0].cpu().numpy()

@torch.no_grad()
def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no-accent', mode='sliding-window'):
    """Synthesize a long text sentence by sentence and return the audio.

    The text is cut with ``split_text_into_sentences``; every non-empty
    sentence is synthesized on its own and the generated audio tokens are
    concatenated, then decoded once at the end.

    For long audio generation, two modes are available:

    * fixed-prompt: keeps using the same prompt the user has provided for
      every sentence.
    * sliding-window: uses the previously generated sentence as the prompt
      for the next one, but has some concern on speaker maintenance. After
      each sentence the prompt is, with probability 0.5, reset to the
      original prompt instead.

    Args:
        text: The full text to synthesize.
        prompt: Path of an ``.npz`` prompt file, or the bare name of one
            located in ``./presets/`` or ``./customs/``; ``None`` or ``""``
            means no prompt.
        language: ``"auto"`` to detect the language with ``langid``,
            otherwise a key of ``lang2token``.
        accent: ``"no-accent"`` to use the language reported by the text
            tokenizer, otherwise a key of ``langdropdown2token`` that
            overrides the text language passed to the model.
        mode: ``"fixed-prompt"`` or ``"sliding-window"``. Ignored (forced to
            ``"sliding-window"``) when no prompt is given.

    Returns:
        The decoded audio as a NumPy array.

    Raises:
        ValueError: If the prompt cannot be found, or ``mode`` is not one of
            the two supported values.
    """
    global model, codec, text_tokenizer, text_collater
    has_prompt = prompt is not None and prompt != ""
    if not has_prompt:
        mode = 'sliding-window'  # If no prompt is given, use sliding-window mode
    sentences = split_text_into_sentences(text)
    # detect language
    if language == "auto":
        language = langid.classify(text)[0]

    # if initial prompt is given, encode it
    if has_prompt:
        # Try the value as a path first, then as a preset name, then as a
        # custom prompt name.
        prompt_path = prompt
        if not os.path.exists(prompt_path):
            prompt_path = "./presets/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            prompt_path = "./customs/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            raise ValueError(f"Cannot find prompt {prompt}")
        # Read everything needed inside the ``with`` so the archive's file
        # handle is closed as soon as the arrays have been extracted.
        with np.load(prompt_path) as prompt_data:
            audio_prompts = prompt_data['audio_tokens']
            text_prompts = prompt_data['text_tokens']
            lang_pr = code2lang[int(prompt_data['lang_code'])]

        # numpy to tensor
        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
        text_prompts = torch.tensor(text_prompts).type(torch.int32)
    else:
        # Zero-length prompts: the first sentence is generated unconditioned.
        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
        text_prompts = torch.zeros([1, 0]).type(torch.int32)
        lang_pr = language if language != 'mix' else 'en'

    if mode not in ('fixed-prompt', 'sliding-window'):
        raise ValueError(f"No such mode {mode}")

    # Both lookups are the same for every sentence, so do them once.
    lang_token = lang2token[language]
    if accent == "no-accent":
        accent_lang = None
    else:
        accent_lang = token2lang[langdropdown2token[accent]]

    # Kept so sliding-window mode can fall back to the user's prompt.
    original_audio_prompts = audio_prompts
    original_text_prompts = text_prompts

    complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
    for sentence in sentences:
        sentence = sentence.replace("\n", "").strip(" ")
        if sentence == "":
            continue
        # The sentence is wrapped in the language token on both sides.
        tagged = lang_token + sentence + lang_token

        enroll_x_lens = text_prompts.shape[-1]
        logging.info(f"synthesize text: {tagged}")
        phone_tokens, langs = text_tokenizer.tokenize(text=f"_{tagged}".strip())
        text_tokens, text_tokens_lens = text_collater([phone_tokens])
        text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
        text_tokens_lens += enroll_x_lens

        encoded_frames = model.inference(
            text_tokens.to(device),
            text_tokens_lens.to(device),
            audio_prompts,
            enroll_x_lens=enroll_x_lens,
            top_k=-100,
            temperature=1,
            prompt_language=lang_pr,
            # accent control
            text_language=langs if accent_lang is None else accent_lang,
        )
        complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)

        if mode == 'sliding-window':
            if torch.rand(1) < 0.5:
                # Condition the next sentence on the one just generated.
                # NOTE(review): the slice appears to keep every quantizer
                # channel of encoded_frames -- confirm its layout.
                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
                text_prompts = text_tokens[:, enroll_x_lens:]
            else:
                # Otherwise go back to the prompt the call started with.
                audio_prompts = original_audio_prompts
                text_prompts = original_text_prompts

    samples = codec.decode(
        [(complete_tokens, None)]
    )
    return samples[0][0].cpu().numpy()
54 changes: 54 additions & 0 deletions utils/sentence_cutter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import nltk
import jieba
import sudachipy
import langid
langid.set_languages(['en', 'zh', 'ja'])

def split_text_into_sentences(text):
    """Split *text* into a list of sentences, keeping the punctuation.

    The language is detected once with ``langid`` and selects the splitter:

    * ``"en"``: ``nltk.tokenize.sent_tokenize``.
    * ``"zh"``: the text is segmented with ``jieba`` and cut after every
      sentence-ending token.
    * ``"ja"``: the text is tokenized with ``sudachipy`` and cut after every
      token tagged as a sentence-ending period.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings. For Chinese and Japanese any trailing
        text without a terminator is returned as a final sentence.

    Raises:
        RuntimeError: If the detected language is none of the three above.
    """
    # Classify once instead of once per branch.
    lang = langid.classify(text)[0]

    if lang == "en":
        return nltk.tokenize.sent_tokenize(text)

    if lang == "zh":
        # Chinese text normally uses the full-width "!" and "?", so both the
        # ASCII and the full-width forms end a sentence.
        terminators = ("。", "!", "?", "!", "?", "……")
        segs = list(jieba.cut(text, cut_all=False))
        sentences = []
        start = 0
        for i, seg in enumerate(segs):
            if seg in terminators:
                # Include the terminator itself in the sentence.
                sentences.append("".join(segs[start:i + 1]))
                start = i + 1
        if start < len(segs):
            sentences.append("".join(segs[start:]))
        return sentences

    if lang == "ja":
        tokenizer = sudachipy.Dictionary().create()
        sentences = []
        parts = []
        for token in tokenizer.tokenize(text):
            parts.append(token.surface())
            pos = token.part_of_speech()
            # "補助記号" / "句点": supplementary symbol tagged as a period.
            if pos[0] == "補助記号" and pos[1] == "句点":
                sentences.append("".join(parts))
                parts = []
        remainder = "".join(parts)
        if remainder:
            sentences.append(remainder)
        return sentences

    raise RuntimeError("It is impossible to reach here.")

# Sample paragraphs in English, Chinese and Japanese; apparently manual test
# inputs for split_text_into_sentences (nothing visible here reads them).
# NOTE: all three are bound to the same name, so each assignment overwrites
# the previous one and only the Japanese sample remains in `long_text`.
long_text = """
This is a very long paragraph, so most TTS model is unable to handle it. Hence, we have to split it into several sentences. With the help of NLTK, we can split it into sentences. However, the punctuation is not preserved, so we have to add it back. How are we going to do write this code? Let's see.
"""

long_text = """
现在我们要来尝试一下中文分句。因为很不幸的是,NLTK不支持中文分句。幸运的是,我们可以使用jieba来分句。但是,jieba分句后,标点符号会丢失,所以我们要手动添加回去。我现在正在想办法把这个例句写的更长更复杂一点,来测试jieba分句的性能。嗯......省略号,感觉不太好,因为省略号不是句号,所以jieba不会把它当作句子的结尾。会这样吗?我们来试试看。
"""

long_text = """
これなら、英語と中国語の分句もできる。でも、日本語はどうする?まつわ、ChatGPTに僕と教えてください。ちょーと待ってください。あ、出来た!
"""

0 comments on commit 69bf5b4

Please sign in to comment.