forked from PlayVoice/whisper-vits-svc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_wave.py
116 lines (101 loc) · 4.03 KB
/
preprocess_wave.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import librosa
import pyworld
import utils
import numpy as np
from scipy.io import wavfile
class FeatureInput(object):
    """Pitch (F0) feature helpers used by the preprocessing script.

    Provides F0 extraction from a wave file, quantisation of F0 values into
    integer bins on a mel-style scale, and a small helper to write a
    normalised 16-bit wave file.
    """

    def __init__(self, samplerate=16000, hop_size=160):
        """Store the analysis settings and precompute the mel-scale bounds.

        Args:
            samplerate: Sampling rate in Hz used to load and write audio.
            hop_size: Hop between analysis frames, in samples.
        """
        self.fs = samplerate
        self.hop = hop_size

        # Quantisation range: f0_min..f0_max (Hz) is mapped onto the integer
        # bins 1..f0_bin - 1 by coarse_f0 / coarse_f0_ts.
        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path):
        """Extract a frame-level F0 contour from the audio file at ``path``.

        The file is loaded at ``self.fs``; F0 is estimated with
        ``pyworld.dio`` and refined with ``pyworld.stonemask``, using one
        frame every ``self.hop`` samples.

        Args:
            path: Path of the audio file to analyse.

        Returns:
            1-D float array of F0 values rounded to one decimal place.
        """
        x, sr = librosa.load(path, sr=self.fs)
        assert sr == self.fs
        # Convert once; both pyworld calls receive the same double-precision
        # signal.
        x = x.astype(np.double)
        f0, t = pyworld.dio(
            x,
            fs=sr,
            f0_ceil=800,
            # frame_period is expressed in milliseconds.
            frame_period=1000 * self.hop / sr,
        )
        f0 = pyworld.stonemask(x, f0, t, self.fs)
        # Vectorised replacement for the per-element round(pitch, 1) loop.
        return np.round(f0, 1)

    # for numpy # code from diffsinger
    def coarse_f0(self, f0):
        """Quantise a NumPy F0 array into integer bins in ``[1, f0_bin - 1]``.

        Values are converted with ``1127 * log(1 + f0 / 700)``; positive
        results are rescaled linearly so that ``f0_min`` lands on bin 1 and
        ``f0_max`` on bin ``f0_bin - 1``. Everything at or below 1 (which
        includes ``f0 == 0``) becomes bin 1, and everything above the top
        bin is clamped to it.

        Args:
            f0: NumPy array of F0 values in Hz. It is not modified.

        Returns:
            Integer array of the same shape holding the bin indices.
        """
        f0_mel = 1127 * np.log(1 + f0 / 700)
        positive = f0_mel > 0
        f0_mel[positive] = (f0_mel[positive] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy (1.24+); the builtin is the exact equivalent.
        f0_coarse = np.rint(f0_mel).astype(int)
        # An empty contour has no max/min to check, so skip the sanity check
        # instead of raising on it.
        assert f0_coarse.size == 0 or (
            f0_coarse.max() <= 255 and f0_coarse.min() >= 1
        ), (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    # for tensor # code from diffsinger
    def coarse_f0_ts(self, f0):
        """Tensor counterpart of :meth:`coarse_f0`.

        Applies the same conversion, rescaling and clamping as the NumPy
        version, using tensor methods (``.log()``, ``.long()``).

        Args:
            f0: Tensor of F0 values in Hz (presumably a ``torch.Tensor`` --
                the code only relies on ``.log()``, ``.long()``, boolean-mask
                assignment and ``.numel()``).

        Returns:
            Integer (long) tensor of bin indices in ``[1, f0_bin - 1]``.
        """
        f0_mel = 1127 * (1 + f0 / 700).log()
        positive = f0_mel > 0
        f0_mel[positive] = (f0_mel[positive] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        # Adding 0.5 before truncating to long rounds to the nearest bin.
        f0_coarse = (f0_mel + 0.5).long()
        # Same empty-input guard as the NumPy version.
        assert f0_coarse.numel() == 0 or (
            f0_coarse.max() <= 255 and f0_coarse.min() >= 1
        ), (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def save_wav(self, wav, path):
        """Write ``wav`` to ``path`` as 16-bit PCM at ``self.fs``.

        The signal is scaled so that its peak sits at 60% of the int16 range.
        The peak is floored at 0.01 so that near-silent input is not
        amplified without bound.

        Args:
            wav: Audio samples. The caller's array is left unmodified.
            path: Destination file path.
        """
        wav = np.asarray(wav)
        peak = max(0.01, np.max(np.abs(wav)))
        gain = 32767 / peak * 0.6
        # Scale into a new array: the previous in-place ``wav *= gain``
        # silently rescaled the caller's data and failed on integer arrays.
        scaled = wav * gain
        wavfile.write(path, self.fs, scaled.astype(np.int16))
if __name__ == "__main__":
    # Script entry point: for every speaker directory under wavPath, compute
    # the F0 features of each .wav file, save them under outPath, and append
    # one line per file to the training manifest ./filelists/vc_file.txt.
    wavPath = "./data/waves"
    outPath = "./data/label"
    manifest_path = "./filelists/vc_file.txt"

    # exist_ok=True keeps the script re-runnable over existing output.
    os.makedirs(outPath, exist_ok=True)
    os.makedirs(os.path.dirname(manifest_path), exist_ok=True)

    # Load the hyper-parameters that define the sampling rate and hop length.
    hps = utils.get_hparams_from_file("./configs/singing_base.json")
    featureInput = FeatureInput(hps.data.sampling_rate, hps.data.hop_length)

    # The context manager closes the manifest even if a file fails midway.
    with open(manifest_path, "w", encoding="utf-8") as vits_file:
        # Sorted so the manifest order is reproducible; os.listdir order is
        # arbitrary.
        for spks in sorted(os.listdir(wavPath)):
            spk_wav_dir = os.path.join(wavPath, spks)
            if not os.path.isdir(spk_wav_dir):
                continue
            # Previously raised FileExistsError on a second run.
            os.makedirs(os.path.join(outPath, spks), exist_ok=True)

            for file in sorted(os.listdir(spk_wav_dir)):
                if not file.endswith(".wav"):
                    continue
                file = file[:-4]  # strip the ".wav" extension

                path_audio = f"{wavPath}/{spks}/{file}.wav"
                path_spkid = f"./data/spkid/{spks}.npy"
                # phone means ppg & hubert
                path_label = f"./data/phone/{spks}/{file}.npy"
                path_pitch = f"{outPath}/{spks}/{file}_pitch.npy"
                path_nsff0 = f"{outPath}/{spks}/{file}_nsff0.npy"

                featur_pit = featureInput.compute_f0(path_audio)
                coarse_pit = featureInput.coarse_f0(featur_pit)
                # The manifest paths are the same strings used for saving,
                # so the manifest always points at what was written.
                np.save(path_pitch, coarse_pit, allow_pickle=False)
                np.save(path_nsff0, featur_pit, allow_pickle=False)

                print(
                    f"{path_audio}|{path_spkid}|{path_label}|{path_pitch}|{path_nsff0}",
                    file=vits_file,
                )