-
Notifications
You must be signed in to change notification settings - Fork 0
/
tts_test.py
128 lines (107 loc) · 4.87 KB
/
tts_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import gradio as gr
import torch
import argparse
from time import sleep
from datetime import datetime
from realtime_chatbot.utils import audio_helpers
from realtime_chatbot.tts_handler import TTSHandlerMultiprocessing, TTSConfig
from realtime_chatbot.speech_enhancer import SpeechEnhancer
# Module-level handles shared by the Gradio callbacks below.
# All three are assigned real objects in the __main__ block at the bottom
# of this file; they remain None if the module is merely imported.
tts_engine = None
tts_handler = None
speech_enhancer = None
def _synthesize_handler(text, buffer_size):
    """Feed pipe-delimited text segments to the TTS handler and collect audio.

    Segments are queued one at a time; every time a full buffer of
    ``buffer_size`` segments has been queued (padding the final buffer
    with empty strings so the handler flushes), one output is polled
    from the handler with a timeout.

    Returns the list of audio outputs received before any timeout.
    """
    segments = text.split("|")
    collected = []
    last_idx = len(segments) - 1
    for idx, segment in enumerate(segments):
        # Segments marked with "~" or "*" are expected to be quick; give them
        # a short timeout. Everything else waits up to a minute.
        timeout_secs = 3 if segment.startswith(("~", "*")) else 60
        tts_handler.queue_input(segment)
        queued = idx
        if (queued + 1) % buffer_size == 0 or idx == last_idx:
            if idx == last_idx:
                # Pad the final partial buffer with empty inputs so the
                # handler sees a complete buffer and emits its output.
                while (queued + 1) % buffer_size > 0:
                    tts_handler.queue_input("")
                    queued += 1
            poll_start = datetime.now()
            result = None
            while result is None:
                result = tts_handler.next_output()
                if (datetime.now() - poll_start).total_seconds() > timeout_secs:
                    break
                sleep(0.001)
            if result is not None:
                collected.append(result)
    return collected
def process_text(text, buffer_size, voice, downsample_factor, duration_factor, pitch_factor, energy_factor):
    """Gradio callback: synthesize `text` and return eight (sample_rate, ndarray) pairs.

    Outputs, in the order the interface declares them:
    control audio, downsampled control, then one experimental output per
    speech-enhancement model (denoiser, noisereduce, sepformer, mimic,
    MetricGAN+, FRCRN).

    Returns all-None when synthesis produced no audio parts.
    """
    tts_handler.queue_config(TTSConfig(tts_engine=tts_engine, buffer_size=buffer_size, speaker=voice,
        duration_factor=duration_factor, pitch_factor=pitch_factor, energy_factor=energy_factor))
    audio_parts = _synthesize_handler(text, buffer_size)
    if not audio_parts:
        # BUG FIX: the interface has 8 output components, but the original
        # early return produced only 5 values, which would make Gradio fail
        # on the empty-synthesis path. Return one None per output.
        return (None,) * 8
    wav_tensor, sr = audio_helpers.concat_audios_to_tensor(audio_parts)
    sr_downsample = sr // downsample_factor
    wav_downsample, _ = audio_helpers.downsample(wav_tensor, sr, sr_downsample)
    outputs = [
        (sr, wav_tensor.cpu().numpy()),
        (sr_downsample, wav_downsample.numpy()),
    ]
    # One experimental output per enhancement model, in UI order.
    for model_name in (
        "denoiser",
        "noisereduce-nonstationary",
        "sepformer-whamr-enhancement",
        "mtl-mimic-voicebank",
        "metricgan-plus-voicebank",
        "damo/speech_frcrn_ans_cirm_16k",
    ):
        wav_enhanced, sr_enhanced = speech_enhancer.enhance(model_name, wav_tensor, sr)
        outputs.append((sr_enhanced, wav_enhanced.cpu().numpy()))
    return tuple(outputs)
if __name__ == "__main__":
    # --- CLI: choose which TTS engine backs the handler ---------------------
    arg_parser = argparse.ArgumentParser("TTS Test")
    arg_parser.add_argument("--tts-engine", type=str, default="fastspeech2", help="TTS engine to use")
    cli_args = arg_parser.parse_args()

    print("\nRunning with arguments:")
    print(cli_args)
    print()

    # Populate the module-level handles used by the Gradio callbacks.
    tts_engine = cli_args.tts_engine
    device = torch.device("cuda")
    tts_handler = TTSHandlerMultiprocessing(
        device=device,
        config=TTSConfig(tts_engine=tts_engine),
        wait_until_running=True
    )
    speech_enhancer = SpeechEnhancer(device=device)

    # --- Gradio UI: one control + one output per enhancement model ----------
    voices = tts_handler.available_speakers
    demo = gr.Interface(
        fn=process_text,
        inputs=[
            "text",
            gr.Slider(1, 5, value=1, step=1),      # buffer size
            gr.Dropdown(
                choices=voices,
                value=voices[0],
                label="Voice"
            ),
            gr.Slider(1, 6, value=1, step=1),      # downsample factor
            gr.Slider(-5, 5, value=1, step=0.1),   # duration factor
            gr.Slider(-5, 5, value=1, step=0.1),   # pitch factor
            gr.Slider(-5, 5, value=1, step=0.1),   # energy factor
        ],
        outputs=[
            gr.Audio(label="Control"),
            gr.Audio(label="Control (downsampled)"),
            gr.Audio(label="Experimental (denoiser)"),
            gr.Audio(label="Experimental (noisereduce nonstationary)"),
            gr.Audio(label="Experimental (sepformer WHAMR! separation)"),
            gr.Audio(label="Experimental (Spectral Feature Mapping with mimic)"),
            gr.Audio(label="Experimental (MetricGAN+)"),
            gr.Audio(label="Experimental (FRCRN)")
        ],
        allow_flagging='never',
        title="TTS Test",
        description="TTS Test"
    )
    demo.launch()