Merge branch 'master' into frontend_dev

ZackHodari · Oct 31, 2019 · e01e865 · e01e865
2 parents 020e62d + 4485d11
commit e01e865
Show file tree

Hide file tree

Showing 9 changed files with 205 additions and 66 deletions.
diff --git a/setup.py b/setup.py
@@ -11,9 +11,10 @@
     # license='MIT',
     install_requires=[
         'numpy',
-        'scipy',
-        'pyworld',
         'pyreaper',
+        'pysptk',
+        'pyworld',
+        'scipy',
         'tqdm'
     ],
     packages=['tts_data_tools'],

diff --git a/tts_data_tools/data_sources.py b/tts_data_tools/data_sources.py
@@ -293,10 +293,17 @@ class NumpyBinarySource(_DataSource):
         Whether to compute delta features.
     ext : str, optional
         The file extension of the saved features, if not set `name` is used.
+    dtype : np.dtype, optional
+        If given, convert the data to this dtype when saving/loading.
+    sentence_level : bool, optional
+        If True, data whose first dimension has length 1 is converted to a scalar before saving.
     """
-    def __init__(self, name, use_deltas=False, ext='npy'):
+    def __init__(self, name, use_deltas=False, ext='npy', dtype=None, sentence_level=False):
         super(NumpyBinarySource, self).__init__(name, use_deltas, ext)
 
+        self.dtype = dtype
+        self.sentence_level = sentence_level
+
     def load_file(self, base_name, data_dir):
         r"""Loads the feature using `np.load`.
 
@@ -312,7 +319,12 @@ def load_file(self, base_name, data_dir):
         int or float or bool or np.ndarray, shape (seq_len, feat_dim)
         """
         file_path = self.file_path(base_name, data_dir)
-        return file_io.load_bin(file_path)
+        data = file_io.load_bin(file_path)
+
+        if self.dtype is not None:
+            data = data.astype(self.dtype)
+
+        return data
 
     def save_file(self, data, base_name, data_dir):
         r"""Saves the feature using `np.save`.
@@ -327,6 +339,14 @@ def save_file(self, data, base_name, data_dir):
             The directory containing all feature types for this dataset.
         """
         file_path = self.file_path(base_name, data_dir)
+
+        if self.dtype is not None:
+            data = data.astype(self.dtype)
+
+        # If this is a sentence-level feature whose first dimension has length 1, convert it to a scalar.
+        if data.shape[0] == 1 and self.sentence_level:
+            data = data.item()
+
         file_io.save_bin(data, file_path)
 
 
@@ -341,10 +361,14 @@ class TextSource(_DataSource):
         Whether to compute delta features.
     ext : str, optional
         The file extension of the saved features, if not set `name` is used.
+    sentence_level : bool, optional
+        If True, loaded data whose first dimension has length 1 is converted to a scalar.
     """
-    def __init__(self, name, use_deltas=False, ext='txt'):
+    def __init__(self, name, use_deltas=False, ext='txt', sentence_level=False):
         super(TextSource, self).__init__(name, use_deltas, ext)
 
+        self.sentence_level = sentence_level
+
     def load_file(self, base_name, data_dir):
         r"""Loads the feature from a text file into a numpy array.
 
@@ -360,13 +384,13 @@ def load_file(self, base_name, data_dir):
         int or float or np.ndarray, shape (seq_len, feat_dim)
         """
         file_path = self.file_path(base_name, data_dir)
-        feature = file_io.load_txt(file_path)
+        data = file_io.load_txt(file_path)
 
         # If the sequence length feature is describing a sentence level length, convert it to a scalar.
-        if feature.shape[0] == 1:
-            feature = feature.item()
+        if data.shape[0] == 1 and self.sentence_level:
+            data = data.item()
 
-        return feature
+        return data
 
     def save_file(self, data, base_name, data_dir):
         r"""Saves data as a text file.

diff --git a/tts_data_tools/file_io.py b/tts_data_tools/file_io.py
@@ -8,8 +8,8 @@
 
 import json
 import os
-from scipy.io import wavfile
 
+from scipy.io import wavfile
 import numpy as np
 
 from tts_data_tools import utils

diff --git a/tts_data_tools/scripts/process_dataset.py b/tts_data_tools/scripts/process_dataset.py
@@ -84,8 +84,8 @@ def process(lab_dir, wav_dir, id_list, out_dir,
     utils.make_dirs(os.path.join(out_dir, 'n_phones'), file_ids)
     utils.make_dirs(os.path.join(out_dir, 'lf0'), file_ids)
     utils.make_dirs(os.path.join(out_dir, 'vuv'), file_ids)
-    utils.make_dirs(os.path.join(out_dir, 'sp'), file_ids)
-    utils.make_dirs(os.path.join(out_dir, 'ap'), file_ids)
+    utils.make_dirs(os.path.join(out_dir, 'mcep'), file_ids)
+    utils.make_dirs(os.path.join(out_dir, 'bap'), file_ids)
 
     for file_id in tqdm(file_ids):
         # Label processing.
@@ -104,7 +104,7 @@ def process(lab_dir, wav_dir, id_list, out_dir,
         wav_path = os.path.join(wav_dir, '{}.wav'.format(file_id))
         wav, sample_rate = file_io.load_wav(wav_path)
 
-        f0, vuv, sp, ap = world_with_reaper_f0.analysis(wav, sample_rate)
+        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
         lf0 = np.log(f0)
 
         # Match the number of frames between label forced-alignment and vocoder analysis.
@@ -122,7 +122,7 @@ def process(lab_dir, wav_dir, id_list, out_dir,
             # Remove 1 frame from each phone's duration starting at the end of the sequence.
             durations[-diff:] -= 1
             n_frames = f0.shape[0]
-            print("Cropped {} frames from durations and  for utterance {}".format(diff, file_id))
+            print("Cropped {} frames from durations for utterance {}".format(diff, file_id))
 
         assert n_frames == np.sum(durations).item()
 
@@ -131,10 +131,10 @@ def process(lab_dir, wav_dir, id_list, out_dir,
 
             start_phone_idx, end_phone_idx = 0, n_phones
             start_frame_idx, end_frame_idx = 0, n_frames
-            if phones[0] == 'sil':
+            if phones[0] in ['sil', '#']:
                 start_phone_idx += 1
                 start_frame_idx += durations[0]
-            if phones[-1] == 'sil':
+            if phones[-1] in ['sil', '#']:
                 end_phone_idx -= 1
                 end_frame_idx -= durations[-1]
 
@@ -151,30 +151,30 @@ def process(lab_dir, wav_dir, id_list, out_dir,
         counter_features = counter_features[trim_frame_slice]
         lf0 = lf0[trim_frame_slice]
         vuv = vuv[trim_frame_slice]
-        sp = sp[trim_frame_slice]
-        ap = ap[trim_frame_slice]
+        mcep = mcep[trim_frame_slice]
+        bap = bap[trim_frame_slice]
 
-        file_io.save_bin(numerical_labels, os.path.join(out_dir, 'lab', file_id))
-        file_io.save_bin(counter_features, os.path.join(out_dir, 'counters', file_id))
+        file_io.save_bin(numerical_labels.astype(np.float32), os.path.join(out_dir, 'lab', file_id))
+        file_io.save_bin(counter_features.astype(np.float32), os.path.join(out_dir, 'counters', file_id))
         file_io.save_txt(durations, os.path.join(out_dir, 'dur', '{}.txt'.format(file_id)))
         file_io.save_lines(phones, os.path.join(out_dir, 'phones', '{}.txt'.format(file_id)))
 
         file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', '{}.txt'.format(file_id)))
         file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', '{}.txt'.format(file_id)))
 
-        file_io.save_bin(lf0, os.path.join(out_dir, 'lf0', file_id))
+        file_io.save_bin(lf0.astype(np.float32), os.path.join(out_dir, 'lf0', file_id))
         file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', file_id))
-        file_io.save_bin(sp, os.path.join(out_dir, 'sp', file_id))
-        file_io.save_bin(ap, os.path.join(out_dir, 'ap', file_id))
+        file_io.save_bin(mcep.astype(np.float32), os.path.join(out_dir, 'mcep', file_id))
+        file_io.save_bin(bap.astype(np.float32), os.path.join(out_dir, 'bap', file_id))
 
     if calculate_normalisation:
         process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
         process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
         process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False, out_dir=out_dir)
 
         process_mvn(out_dir, 'lf0', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
-        process_mvn(out_dir, 'sp', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
-        process_mvn(out_dir, 'ap', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
+        process_mvn(out_dir, 'mcep', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
+        process_mvn(out_dir, 'bap', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
 
 
 def main():

diff --git a/tts_data_tools/scripts/process_phones.py b/tts_data_tools/scripts/process_phones.py
@@ -38,6 +38,9 @@ def process(lab_dir, id_list, out_dir, state_level):
     """
     file_ids = utils.get_file_ids(id_list=id_list)
 
+    utils.make_dirs(os.path.join(out_dir, 'phones'), file_ids)
+    utils.make_dirs(os.path.join(out_dir, 'n_phones'), file_ids)
+
     for file_id in file_ids:
         # Label processing.
         lab_path = os.path.join(lab_dir, '{}.lab'.format(file_id))

diff --git a/tts_data_tools/wav_gen/reaper_f0.py b/tts_data_tools/wav_gen/reaper_f0.py
@@ -10,7 +10,7 @@
 
 import pyreaper
 
-REAPER_UNVOICED_VALUE = -1.
+UNVOICED_VALUE = -1.
 
 
 def add_arguments(parser):
@@ -27,7 +27,7 @@ def add_arguments(parser):
 
 
 def extract_vuv(f0):
-    return utils.extract_vuv(f0, REAPER_UNVOICED_VALUE)
+    return utils.extract_vuv(f0, UNVOICED_VALUE)
 
 
 def basic_analysis(wav, sample_rate):

diff --git a/tts_data_tools/wav_gen/utils.py b/tts_data_tools/wav_gen/utils.py
@@ -1,9 +1,65 @@
 import numpy as np
 from scipy.signal import convolve2d
 
+FRAME_LENGTH = {
+    8000: 512,
+    10000: 512,
+    16000: 1024,
+    22050: 1024,
+    24000: 1024,
+    44100: 2048,
+    48000: 2048,
+}
+
+r"""
+Alpha is used to approximate the effect of the mel-scale filter bank; the best choice of alpha depends on the sampling
+rate. The following code can be used to determine a good value of alpha manually.
+
+See https://www.sp.nitech.ac.jp/~tokuda/tokuda_tamkang2002.pdf for more details.
+
+```
+def plot_warping_alpha_or_mel(alpha, sample_rate, frame_length=1024):
+    nfft_half = frame_length // 2 + 1
+
+    hz = np.linspace(0, sample_rate / 2., nfft_half)
+    mel = 1127. * np.log(1. + (hz / 700.))
+    mel = mel / mel.max() * np.pi
+
+    omega = np.linspace(0, np.pi, nfft_half)
+    H = (np.exp(-1j * omega) - alpha) / (1 - alpha * np.exp(-1j * omega))
+    warped_omega = -np.arctan2(np.imag(H), np.real(H))
+
+    plt.figure(figsize=(6, 6))
+    plt.plot(omega, mel, label=f'mel (sr={sample_rate})')
+    plt.plot(omega, warped_omega, label=f'alpha={alpha}')
+    plt.legend(loc='lower right')
+    plt.xticks([0, np.pi/2., np.pi], [r'$0$', r'$\frac{\pi}{2}$', r'$\pi$'])
+    plt.yticks([0, np.pi/2., np.pi], [r'$0$', r'$\frac{\pi}{2}$', r'$\pi$'])
+    plt.show()
+
+plot_warping_alpha_or_mel(0.36, 8000)
+plot_warping_alpha_or_mel(0.39, 10000)
+plot_warping_alpha_or_mel(0.46, 16000)
+plot_warping_alpha_or_mel(0.50, 22050)
+plot_warping_alpha_or_mel(0.51, 24000)
+plot_warping_alpha_or_mel(0.58, 44100)
+plot_warping_alpha_or_mel(0.60, 48000)
+```
+"""
+
+ALPHA = {
+    8000: 0.36,
+    10000: 0.39,
+    16000: 0.46,
+    22050: 0.50,
+    24000: 0.51,
+    44100: 0.58,
+    48000: 0.60,
+}
+
 
 def compute_running_window(feature, window):
-    """Computing dynamic features using a window is exactly a convolution operation."""
+    r"""Computing dynamic features using a window is exactly a convolution operation."""
     # Check that the window length is odd.
     assert len(window) % 2 == 1
 
@@ -35,7 +91,7 @@ def extract_vuv(signal, unvoiced_value):
 
 
 def interpolate(signal, is_voiced):
-    """Linearly interpolates the signal in unvoiced regions such that there are no discontinuities.
+    r"""Linearly interpolates the signal in unvoiced regions such that there are no discontinuities.
 
     Args:
         signal (np.ndarray[n_frames, feat_dim]): Temporal signal.