Skip to content

Commit

Permalink
_synthesis_impl() 前処理の関数化 (#784)
Browse files Browse the repository at this point in the history
Co-authored-by: Hiroshiba <[email protected]>
  • Loading branch information
tarepan and Hiroshiba authored Nov 28, 2023
1 parent bad1209 commit cb33ffa
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 63 deletions.
81 changes: 80 additions & 1 deletion test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math
from copy import deepcopy
from random import random
from typing import Union
from typing import Optional, Union
from unittest import TestCase
from unittest.mock import Mock

Expand All @@ -13,6 +13,7 @@

# TODO: import from voicevox_engine.synthesis_engine.mora
from voicevox_engine.synthesis_engine.synthesis_engine import (
generate_frame_scale_features,
mora_phoneme_list,
pre_process,
split_mora,
Expand Down Expand Up @@ -96,6 +97,84 @@ def is_model_loaded(self, style_id):
return True


def _gen_mora(
    text: str,
    consonant: Optional[str],
    consonant_length: Optional[float],
    vowel: str,
    vowel_length: float,
    pitch: float,
) -> Mora:
    """Build a `Mora` from the given attributes (test helper)."""
    attributes = {
        "text": text,
        "consonant": consonant,
        "consonant_length": consonant_length,
        "vowel": vowel,
        "vowel_length": vowel_length,
        "pitch": pitch,
    }
    return Mora(**attributes)


def test_generate_frame_scale_features():
    """Test `generate_frame_scale_features`."""
    # Inputs
    query = AudioQuery(
        accent_phrases=[],
        speedScale=2.0,
        pitchScale=2.0,
        intonationScale=0.5,
        prePhonemeLength=2 * 0.01067,  # 0.01067 [sec/frame]
        postPhonemeLength=6 * 0.01067,
        volumeScale=0.0,
        outputSamplingRate=0,
        outputStereo=False,
    )
    flatten_moras = [
        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
    ]
    phoneme_str = "pau k o N pau h i h O pau"
    phoneme_data_list = [OjtPhoneme(p, 0, 0) for p in phoneme_str.split()]

    # Ground Truths
    # Frames per phoneme; speedScale=2.0 halves every duration beforehand.
    #                  Pre k  o  N  pau h  i  h  O  Pst
    frm_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
    n_frm = sum(frm_per_phoneme)
    frm_per_phoneme = numpy.array(frm_per_phoneme, dtype=numpy.int32)

    # Check ground-truth/input consistency before calling the function under test.
    assert frm_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"

    # Phoneme ID (one-hot index) for each frame.
    #                Pr k   o   o   N  N  pau h   i   i   h   h   O  Pt Pt Pt
    phoneme_frms = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
    phoneme_gt = numpy.zeros([n_frm, 45], dtype=numpy.float32)
    for frm_idx, phoneme_idx in enumerate(phoneme_frms):
        phoneme_gt[frm_idx, phoneme_idx] = 1.0

    # Pitch: pitchScale=2.0 multiplies f0 by 2**2 = 4, then
    # intonationScale=0.5 halves the deviation from the voiced mean (300):
    #   per vowel: [0, 200, 200, 0, 500, 0, 0] -> [0, 250, 250, 0, 400, 0, 0]
    # Frame-wise f0, repeated per consonant+vowel group:
    #   frames per vowel group: pau ko N pau hi hO pau = [1, 3, 2, 1, 3, 3, 3]
    # pau ko ko ko N  N
    f0_gt_1 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0]
    # pau hi hi hi
    f0_gt_2 = [0.0, 400.0, 400.0, 400.0]
    # hO hO hO pau pau pau
    f0_gt_3 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    f0_gt = numpy.array(f0_gt_1 + f0_gt_2 + f0_gt_3, dtype=numpy.float32)

    # Outputs
    phoneme_pred, f0_pred = generate_frame_scale_features(
        query, flatten_moras, phoneme_data_list
    )

    assert numpy.array_equal(phoneme_pred, phoneme_gt), "Wrong phoneme onehot frames"
    assert numpy.array_equal(f0_pred, f0_gt), "Wrong frame-wise f0"


class TestSynthesisEngine(TestCase):
def setUp(self):
super().setUp()
Expand Down
150 changes: 88 additions & 62 deletions voicevox_engine/synthesis_engine/synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,92 @@ def pre_process(
return flatten_moras, phoneme_data_list


def generate_frame_scale_features(
    query: AudioQuery, flatten_moras: List[Mora], phoneme_data_list: List[OjtPhoneme]
):
    """
    Generate frame-scale (per-frame) features from phoneme-scale inputs.

    Parameters
    ----------
    query : AudioQuery
        Audio synthesis query (speed/pitch/intonation scales, silence lengths).
    flatten_moras : List[Mora]
        Flattened mora sequence.
    phoneme_data_list : List[OjtPhoneme]
        Phoneme sequence.

    Returns
    -------
    phoneme : NDArray[]
        Frame-wise phoneme one-hot vectors, shape (n_frames, num_phoneme).
    f0 : NDArray[]
        Frame-wise fundamental frequency (f0) sequence, shape (n_frames,).
    """
    # Build the list of OpenJTalk phoneme IDs from the OjtPhoneme list.
    phoneme_list_s = numpy.array(
        [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64
    )
    # length
    # Expand and concatenate per-phoneme durations (consonant length only
    # when the mora has a consonant), including leading/trailing silence.
    phoneme_length_list = (
        [query.prePhonemeLength]
        + [
            length
            for mora in flatten_moras
            for length in (
                [mora.consonant_length] if mora.consonant is not None else []
            )
            + [mora.vowel_length]
        ]
        + [query.postPhonemeLength]
    )
    # Cast to float.
    phoneme_length = numpy.array(phoneme_length_list, dtype=numpy.float32)

    # Apply the speed scale (speaking rate) to the durations.
    phoneme_length /= query.speedScale

    # pitch
    # Expand and concatenate mora pitches (0 pads for pre/post silence); cast to float.
    f0_list = [0] + [mora.pitch for mora in flatten_moras] + [0]
    f0 = numpy.array(f0_list, dtype=numpy.float32)
    # Apply pitch adjustment: multiply by 2 ** pitchScale.
    f0 *= 2**query.pitchScale

    # Mask of voiced phonemes (those with pitch > 0).
    voiced = f0 > 0
    # Mean pitch over voiced phonemes (NaN when nothing is voiced).
    mean_f0 = f0[voiced].mean()
    # When the mean is not NaN, apply intonation: scale the deviation from
    # the mean and add the mean back, (f0 - mean_f0) * intonationScale + mean_f0.
    if not numpy.isnan(mean_f0):
        f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0

    # Extract vowel positions from the decomposed phoneme list into a numpy array.
    _, _, vowel_indexes_data = split_mora(phoneme_data_list)
    vowel_indexes = numpy.array(vowel_indexes_data)

    # forward decode
    # Multiply durations by the frame rate and round to integer frame counts.
    rate = 24000 / 256  # framerate 93.75 [frame/sec]
    phoneme_bin_num = numpy.round(phoneme_length * rate).astype(numpy.int32)

    # Repeat each phoneme ID for its duration in frames.
    phoneme = numpy.repeat(phoneme_list_s, phoneme_bin_num)
    # Repeat each f0 value over the combined consonant+vowel frame count
    # (frame counts are grouped by splitting after each vowel index).
    f0 = numpy.repeat(
        f0,
        [a.sum() for a in numpy.split(phoneme_bin_num, vowel_indexes[:-1] + 1)],
    )

    # Allocate a zero matrix of shape (n_frames, OjtPhoneme.num_phoneme (45)).
    array = numpy.zeros((len(phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32)
    # Set one element per row, turning each row into a one-hot vector.
    array[numpy.arange(len(phoneme)), phoneme] = 1
    phoneme = array

    return phoneme, f0


class SynthesisEngine(SynthesisEngineBase):
def __init__(
self,
Expand Down Expand Up @@ -410,69 +496,9 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
# AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする
flatten_moras, phoneme_data_list = pre_process(query.accent_phrases)

# OjtPhonemeのリストからOjtPhonemeのPhoneme ID(OpenJTalkにおける音素のID)のリストを作る
phoneme_list_s = numpy.array(
[p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64
)

# length
# 音素の長さをリストに展開・結合する。ここには前後の無音時間も含まれる
phoneme_length_list = (
[query.prePhonemeLength]
+ [
length
for mora in flatten_moras
for length in (
[mora.consonant_length] if mora.consonant is not None else []
)
+ [mora.vowel_length]
]
+ [query.postPhonemeLength]
phoneme, f0 = generate_frame_scale_features(
query, flatten_moras, phoneme_data_list
)
# floatにキャスト
phoneme_length = numpy.array(phoneme_length_list, dtype=numpy.float32)

# lengthにSpeed Scale(話速)を適用する
phoneme_length /= query.speedScale

# pitch
# モーラの音高(ピッチ)を展開・結合し、floatにキャストする
f0_list = [0] + [mora.pitch for mora in flatten_moras] + [0]
f0 = numpy.array(f0_list, dtype=numpy.float32)
# 音高(ピッチ)の調節を適用する(2のPitch Scale乗を掛ける)
f0 *= 2**query.pitchScale

# 有声音素(音高(ピッチ)が0より大きいもの)か否かを抽出する
voiced = f0 > 0
# 有声音素の音高(ピッチ)の平均値を求める
mean_f0 = f0[voiced].mean()
# 平均値がNaNではないとき、抑揚を適用する
# 抑揚は音高と音高の平均値の差に抑揚を掛けたもの((f0 - mean_f0) * Intonation Scale)に抑揚の平均値(mean_f0)を足したもの
if not numpy.isnan(mean_f0):
f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0

# OjtPhonemeの形に分解された音素リストから、vowel(母音)の位置を抜き出し、numpyのarrayにする
_, _, vowel_indexes_data = split_mora(phoneme_data_list)
vowel_indexes = numpy.array(vowel_indexes_data)

# forward decode
# 音素の長さにrateを掛け、intにキャストする
rate = 24000 / 256
phoneme_bin_num = numpy.round(phoneme_length * rate).astype(numpy.int32)

# Phoneme IDを音素の長さ分繰り返す
phoneme = numpy.repeat(phoneme_list_s, phoneme_bin_num)
# f0を母音と子音の長さの合計分繰り返す
f0 = numpy.repeat(
f0,
[a.sum() for a in numpy.split(phoneme_bin_num, vowel_indexes[:-1] + 1)],
)

# phonemeの長さとOjtPhonemeのnum_phoneme(45)分の0で初期化された2次元配列を用意する
array = numpy.zeros((len(phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32)
# 初期化された2次元配列の各行をone hotにする
array[numpy.arange(len(phoneme)), phoneme] = 1
phoneme = array

# 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する
with self.mutex:
Expand Down

0 comments on commit cb33ffa

Please sign in to comment.