Skip to content

Commit

Permalink
Merge branch 'master' into frontend_dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ZackHodari committed Oct 31, 2019
2 parents 020e62d + 4485d11 commit e01e865
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 66 deletions.
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@
# license='MIT',
install_requires=[
'numpy',
'scipy',
'pyworld',
'pyreaper',
'pysptk',
'pyworld',
'scipy',
'tqdm'
],
packages=['tts_data_tools'],
Expand Down
38 changes: 31 additions & 7 deletions tts_data_tools/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,17 @@ class NumpyBinarySource(_DataSource):
Whether to compute delta features.
ext : str, optional
The file extension of the saved features, if not set `name` is used.
dtype : np.dtype, optional
If given, convert the data to this dtype when saving/loading.
sentence_level : bool, optional
If True, data whose first dimension has length 1 is converted to a scalar before saving.
"""
def __init__(self, name, use_deltas=False, ext='npy'):
def __init__(self, name, use_deltas=False, ext='npy', dtype=None, sentence_level=False):
super(NumpyBinarySource, self).__init__(name, use_deltas, ext)

self.dtype = dtype
self.sentence_level = sentence_level

def load_file(self, base_name, data_dir):
r"""Loads the feature using `np.load`.
Expand All @@ -312,7 +319,12 @@ def load_file(self, base_name, data_dir):
int or float or bool or np.ndarray, shape (seq_len, feat_dim)
"""
file_path = self.file_path(base_name, data_dir)
return file_io.load_bin(file_path)
data = file_io.load_bin(file_path)

if self.dtype is not None:
data = data.astype(self.dtype)

return data

def save_file(self, data, base_name, data_dir):
r"""Saves the feature using `np.save`.
Expand All @@ -327,6 +339,14 @@ def save_file(self, data, base_name, data_dir):
The directory containing all feature types for this dataset.
"""
file_path = self.file_path(base_name, data_dir)

if self.dtype is not None:
data = data.astype(self.dtype)

# If the feature is sentence-level (e.g. a sequence length) and holds a single value, convert it to a scalar.
if data.shape[0] == 1 and self.sentence_level:
data = data.item()

file_io.save_bin(data, file_path)


Expand All @@ -341,10 +361,14 @@ class TextSource(_DataSource):
Whether to compute delta features.
ext : str, optional
The file extension of the saved features, if not set `name` is used.
sentence_level : bool, optional
If True, loaded data whose first dimension has length 1 is converted to a scalar.
"""
def __init__(self, name, use_deltas=False, ext='txt'):
def __init__(self, name, use_deltas=False, ext='txt', sentence_level=False):
super(TextSource, self).__init__(name, use_deltas, ext)

self.sentence_level = sentence_level

def load_file(self, base_name, data_dir):
r"""Loads the feature from a text file into a numpy array.
Expand All @@ -360,13 +384,13 @@ def load_file(self, base_name, data_dir):
int or float or np.ndarray, shape (seq_len, feat_dim)
"""
file_path = self.file_path(base_name, data_dir)
feature = file_io.load_txt(file_path)
data = file_io.load_txt(file_path)

# If the feature is sentence-level (e.g. a sequence length) and holds a single value, convert it to a scalar.
if feature.shape[0] == 1:
feature = feature.item()
if data.shape[0] == 1 and self.sentence_level:
data = data.item()

return feature
return data

def save_file(self, data, base_name, data_dir):
r"""Saves data as a text file.
Expand Down
2 changes: 1 addition & 1 deletion tts_data_tools/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

import json
import os
from scipy.io import wavfile

from scipy.io import wavfile
import numpy as np

from tts_data_tools import utils
Expand Down
30 changes: 15 additions & 15 deletions tts_data_tools/scripts/process_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ def process(lab_dir, wav_dir, id_list, out_dir,
utils.make_dirs(os.path.join(out_dir, 'n_phones'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'lf0'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'vuv'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'sp'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'ap'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'mcep'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'bap'), file_ids)

for file_id in tqdm(file_ids):
# Label processing.
Expand All @@ -104,7 +104,7 @@ def process(lab_dir, wav_dir, id_list, out_dir,
wav_path = os.path.join(wav_dir, '{}.wav'.format(file_id))
wav, sample_rate = file_io.load_wav(wav_path)

f0, vuv, sp, ap = world_with_reaper_f0.analysis(wav, sample_rate)
f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
lf0 = np.log(f0)

# Match the number of frames between label forced-alignment and vocoder analysis.
Expand All @@ -122,7 +122,7 @@ def process(lab_dir, wav_dir, id_list, out_dir,
# Remove 1 frame from each phone's duration starting at the end of the sequence.
durations[-diff:] -= 1
n_frames = f0.shape[0]
print("Cropped {} frames from durations and for utterance {}".format(diff, file_id))
print("Cropped {} frames from durations for utterance {}".format(diff, file_id))

assert n_frames == np.sum(durations).item()

Expand All @@ -131,10 +131,10 @@ def process(lab_dir, wav_dir, id_list, out_dir,

start_phone_idx, end_phone_idx = 0, n_phones
start_frame_idx, end_frame_idx = 0, n_frames
if phones[0] == 'sil':
if phones[0] in ['sil', '#']:
start_phone_idx += 1
start_frame_idx += durations[0]
if phones[-1] == 'sil':
if phones[-1] in ['sil', '#']:
end_phone_idx -= 1
end_frame_idx -= durations[-1]

Expand All @@ -151,30 +151,30 @@ def process(lab_dir, wav_dir, id_list, out_dir,
counter_features = counter_features[trim_frame_slice]
lf0 = lf0[trim_frame_slice]
vuv = vuv[trim_frame_slice]
sp = sp[trim_frame_slice]
ap = ap[trim_frame_slice]
mcep = mcep[trim_frame_slice]
bap = bap[trim_frame_slice]

file_io.save_bin(numerical_labels, os.path.join(out_dir, 'lab', file_id))
file_io.save_bin(counter_features, os.path.join(out_dir, 'counters', file_id))
file_io.save_bin(numerical_labels.astype(np.float32), os.path.join(out_dir, 'lab', file_id))
file_io.save_bin(counter_features.astype(np.float32), os.path.join(out_dir, 'counters', file_id))
file_io.save_txt(durations, os.path.join(out_dir, 'dur', '{}.txt'.format(file_id)))
file_io.save_lines(phones, os.path.join(out_dir, 'phones', '{}.txt'.format(file_id)))

file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', '{}.txt'.format(file_id)))
file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', '{}.txt'.format(file_id)))

file_io.save_bin(lf0, os.path.join(out_dir, 'lf0', file_id))
file_io.save_bin(lf0.astype(np.float32), os.path.join(out_dir, 'lf0', file_id))
file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', file_id))
file_io.save_bin(sp, os.path.join(out_dir, 'sp', file_id))
file_io.save_bin(ap, os.path.join(out_dir, 'ap', file_id))
file_io.save_bin(mcep.astype(np.float32), os.path.join(out_dir, 'mcep', file_id))
file_io.save_bin(bap.astype(np.float32), os.path.join(out_dir, 'bap', file_id))

if calculate_normalisation:
process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False, out_dir=out_dir)

process_mvn(out_dir, 'lf0', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
process_mvn(out_dir, 'sp', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
process_mvn(out_dir, 'ap', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
process_mvn(out_dir, 'mcep', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
process_mvn(out_dir, 'bap', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)


def main():
Expand Down
3 changes: 3 additions & 0 deletions tts_data_tools/scripts/process_phones.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def process(lab_dir, id_list, out_dir, state_level):
"""
file_ids = utils.get_file_ids(id_list=id_list)

utils.make_dirs(os.path.join(out_dir, 'phones'), file_ids)
utils.make_dirs(os.path.join(out_dir, 'n_phones'), file_ids)

for file_id in file_ids:
# Label processing.
lab_path = os.path.join(lab_dir, '{}.lab'.format(file_id))
Expand Down
4 changes: 2 additions & 2 deletions tts_data_tools/wav_gen/reaper_f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import pyreaper

REAPER_UNVOICED_VALUE = -1.
UNVOICED_VALUE = -1.


def add_arguments(parser):
Expand All @@ -27,7 +27,7 @@ def add_arguments(parser):


def extract_vuv(f0):
return utils.extract_vuv(f0, REAPER_UNVOICED_VALUE)
return utils.extract_vuv(f0, UNVOICED_VALUE)


def basic_analysis(wav, sample_rate):
Expand Down
60 changes: 58 additions & 2 deletions tts_data_tools/wav_gen/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,65 @@
import numpy as np
from scipy.signal import convolve2d

FRAME_LENGTH = {
8000: 512,
10000: 512,
16000: 1024,
22050: 1024,
24000: 1024,
44100: 2048,
48000: 2048,
}

r"""
Alpha is used to approximate the effect of the mel-scale filter bank; the choice of alpha depends on the sampling
rate. The following code (which assumes `numpy` is imported as `np` and `matplotlib.pyplot` as `plt`) can be used to
manually determine a good value of alpha.
See https://www.sp.nitech.ac.jp/~tokuda/tokuda_tamkang2002.pdf for more details.
```
def plot_warping_alpha_or_mel(alpha, sample_rate, frame_length=1024):
nfft_half = frame_length // 2 + 1
hz = np.linspace(0, sample_rate / 2., nfft_half)
mel = 1127. * np.log(1. + (hz / 700.))
mel = mel / mel.max() * np.pi
omega = np.linspace(0, np.pi, nfft_half)
H = (np.exp(-1j * omega) - alpha) / (1 - alpha * np.exp(-1j * omega))
warped_omega = -np.arctan2(np.imag(H), np.real(H))
plt.figure(figsize=(6, 6))
plt.plot(omega, mel, label=f'mel (sr={sample_rate})')
plt.plot(omega, warped_omega, label=f'alpha={alpha}')
plt.legend(loc='lower right')
plt.xticks([0, np.pi/2., np.pi], [r'$0$', r'$\frac{\pi}{2}$', r'$\pi$'])
plt.yticks([0, np.pi/2., np.pi], [r'$0$', r'$\frac{\pi}{2}$', r'$\pi$'])
plt.show()
plot_warping_alpha_or_mel(0.36, 8000)
plot_warping_alpha_or_mel(0.39, 10000)
plot_warping_alpha_or_mel(0.46, 16000)
plot_warping_alpha_or_mel(0.50, 22050)
plot_warping_alpha_or_mel(0.51, 24000)
plot_warping_alpha_or_mel(0.58, 44100)
plot_warping_alpha_or_mel(0.60, 48000)
```
"""

ALPHA = {
8000: 0.36,
10000: 0.39,
16000: 0.46,
22050: 0.50,
24000: 0.51,
44100: 0.58,
48000: 0.60,
}


def compute_running_window(feature, window):
"""Computing dynamic features using a window is exactly a convolution operation."""
r"""Computing dynamic features using a window is exactly a convolution operation."""
# Check that the window length is odd.
assert len(window) % 2 == 1

Expand Down Expand Up @@ -35,7 +91,7 @@ def extract_vuv(signal, unvoiced_value):


def interpolate(signal, is_voiced):
"""Linearly interpolates the signal in unvoiced regions such that there are no discontinuities.
r"""Linearly interpolates the signal in unvoiced regions such that there are no discontinuities.
Args:
signal (np.ndarray[n_frames, feat_dim]): Temporal signal.
Expand Down
Loading

0 comments on commit e01e865

Please sign in to comment.