diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
index c7d992293..e5515bd09 100644
--- a/test/test_synthesis_engine.py
+++ b/test/test_synthesis_engine.py
@@ -1,7 +1,7 @@
 import math
 from copy import deepcopy
 from random import random
-from typing import Union
+from typing import Optional, Union
 from unittest import TestCase
 from unittest.mock import Mock
 
@@ -13,6 +13,7 @@
 
 # TODO: import from voicevox_engine.synthesis_engine.mora
 from voicevox_engine.synthesis_engine.synthesis_engine import (
+    generate_frame_scale_features,
     mora_phoneme_list,
     pre_process,
     split_mora,
@@ -96,6 +97,84 @@ def is_model_loaded(self, style_id):
         return True
 
 
+def _gen_mora(
+    text: str,
+    consonant: Optional[str],
+    consonant_length: Optional[float],
+    vowel: str,
+    vowel_length: float,
+    pitch: float,
+) -> Mora:
+    return Mora(
+        text=text,
+        consonant=consonant,
+        consonant_length=consonant_length,
+        vowel=vowel,
+        vowel_length=vowel_length,
+        pitch=pitch,
+    )
+
+
+def test_generate_frame_scale_features():
+    """Test `generate_frame_scale_features`."""
+    # Inputs
+    query = AudioQuery(
+        accent_phrases=[],
+        speedScale=2.0,
+        pitchScale=2.0,
+        intonationScale=0.5,
+        prePhonemeLength=2 * 0.01067,  # 0.01067 [sec/frame]
+        postPhonemeLength=6 * 0.01067,
+        volumeScale=0.0,
+        outputSamplingRate=0,
+        outputStereo=False,
+    )
+    flatten_moras = [
+        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+    ]
+    phoneme_str = "pau k o N pau h i h O pau"
+    phoneme_data_list = [OjtPhoneme(p, 0, 0) for p in phoneme_str.split()]
+
+    # Ground truths
+    #                  Pre  k  o  N  pau  h  i  h  O  Pst
+    frm_per_phoneme = [1,   1, 2, 2, 1,   1, 2, 2, 1, 3]
+    n_frm = sum(frm_per_phoneme)
+    frm_per_phoneme = numpy.array(frm_per_phoneme, dtype=numpy.int32)
+
+    # Phoneme ID per frame
+    #               Pr  k   o   o   N  N  pau h   i   i   h   h   O  Pt Pt Pt
+    phoneme_frms = [0,  23, 30, 30, 4, 4, 0,  19, 21, 21, 19, 19, 5, 0, 0, 0]
+    phoneme_gt = numpy.zeros([n_frm, 45], dtype=numpy.float32)
+    for frm_idx, phoneme_idx in enumerate(phoneme_frms):
+        phoneme_gt[frm_idx, phoneme_idx] = 1.0
+
+    # Pitch - x4 value (pitchScale 2.0) & x0.5 variance (intonationScale 0.5)
+    #        Pre  ko     N      pau  hi     hO   Pst
+    # f0 = [0.0, 200.0, 200.0, 0.0, 500.0, 0.0, 0.0]  # voiced mean is 300
+    # f0 = [0.0, 250.0, 250.0, 0.0, 400.0, 0.0, 0.0]  # after intonationScale 0.5
+    # Each mora's f0 is repeated over its frames (consonant + vowel):
+    #                  pau ko N  pau hi hO pau
+    # frm_per_vowel = [1,  3, 2, 1,  3, 3, 3]
+    #           pau  ko     ko     ko     N      N
+    f0_gt_1 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0]
+    #           pau  hi     hi     hi
+    f0_gt_2 = [0.0, 400.0, 400.0, 400.0]
+    #           hO   hO   hO   pau  pau  pau
+    f0_gt_3 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+    f0_gt = numpy.array(f0_gt_1 + f0_gt_2 + f0_gt_3, dtype=numpy.float32)
+
+    phoneme_pred, f0_pred = generate_frame_scale_features(
+        query, flatten_moras, phoneme_data_list
+    )
+
+    assert frm_per_phoneme.shape[0] == len(phoneme_data_list), "Precondition: one frame count per phoneme"
+
+    assert numpy.array_equal(phoneme_pred, phoneme_gt), "Wrong frame-wise phoneme onehot"
+    assert numpy.array_equal(f0_pred, f0_gt), "Wrong frame-wise f0"
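+
+
+# Editorial sketch (not part of the original change): how the frame counts in
+# the ground truths above can be derived. One frame is 256 samples at 24000 Hz
+# (256 / 24000 ~= 0.01067 [sec/frame]), and each length is divided by
+# `speedScale` before being rounded to frames. `_expected_frames` and the test
+# below are hypothetical helpers introduced only for this illustration.
+def _expected_frames(length_sec: float, speed_scale: float) -> int:
+    rate = 24000 / 256  # framerate 93.75 [frame/sec]
+    return int(round(length_sec / speed_scale * rate))
+
+
+def test_expected_frames_sketch():
+    # The vowel "o" above: 4 * 0.01067 [sec] at speedScale 2.0 -> 2 frames
+    assert _expected_frames(4 * 0.01067, 2.0) == 2
+    # The pre-phoneme silence: 2 * 0.01067 [sec] at speedScale 2.0 -> 1 frame
+    assert _expected_frames(2 * 0.01067, 2.0) == 1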
+
+
 class TestSynthesisEngine(TestCase):
     def setUp(self):
         super().setUp()
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index 81cad3d44..c44f3d553 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -127,6 +127,92 @@ def pre_process(
     return flatten_moras, phoneme_data_list
 
 
+def generate_frame_scale_features(
+    query: AudioQuery, flatten_moras: List[Mora], phoneme_data_list: List[OjtPhoneme]
+):
+    """
+    Generate frame-scale (per-frame) features.
+    Parameters
+    ----------
+    query : AudioQuery
+        Audio synthesis query
+    flatten_moras : List[Mora]
+        Mora sequence
+    phoneme_data_list : List[OjtPhoneme]
+        Phoneme sequence
+    Returns
+    -------
+    phoneme : numpy.ndarray
+        Frame-wise phoneme one-hot vector sequence
+    f0 : numpy.ndarray
+        Frame-wise fundamental frequency (f0) sequence
+    """
+    # Build the list of phoneme IDs (OpenJTalk phoneme IDs) from the OjtPhoneme list
+    phoneme_list_s = numpy.array(
+        [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64
+    )
+
+    # length
+    # Flatten and concatenate the phoneme lengths; this includes the leading and trailing silence
+    phoneme_length_list = (
+        [query.prePhonemeLength]
+        + [
+            length
+            for mora in flatten_moras
+            for length in (
+                [mora.consonant_length] if mora.consonant is not None else []
+            )
+            + [mora.vowel_length]
+        ]
+        + [query.postPhonemeLength]
+    )
+    # Cast to float
+    phoneme_length = numpy.array(phoneme_length_list, dtype=numpy.float32)
+
+    # Apply the Speed Scale (speaking rate) to the lengths
+    phoneme_length /= query.speedScale
+
+    # pitch
+    # Flatten and concatenate the mora pitches, then cast to float
+    f0_list = [0] + [mora.pitch for mora in flatten_moras] + [0]
+    f0 = numpy.array(f0_list, dtype=numpy.float32)
+    # Apply the pitch adjustment (multiply by 2 to the power of Pitch Scale)
+    f0 *= 2**query.pitchScale
+
+    # Determine which phonemes are voiced (pitch greater than 0)
+    voiced = f0 > 0
+    # Mean pitch of the voiced phonemes
+    mean_f0 = f0[voiced].mean()
+    # If the mean is not NaN, apply the intonation: the deviation from the mean
+    # scaled by the Intonation Scale ((f0 - mean_f0) * intonationScale), plus the mean (mean_f0)
+    if not numpy.isnan(mean_f0):
+        f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0
+
+    # Extract the vowel positions from the decomposed OjtPhoneme list into a numpy array
+    _, _, vowel_indexes_data = split_mora(phoneme_data_list)
+    vowel_indexes = numpy.array(vowel_indexes_data)
+
+    # forward decode
+    # Multiply the phoneme lengths by the frame rate and cast to int
+    rate = 24000 / 256  # framerate 93.75 [frame/sec]
+    phoneme_bin_num = numpy.round(phoneme_length * rate).astype(numpy.int32)
+
+    # Repeat each phoneme ID for that phoneme's length in frames
+    phoneme = numpy.repeat(phoneme_list_s, phoneme_bin_num)
+    # Repeat each f0 value for the combined frame length of its consonant and vowel
+    f0 = numpy.repeat(
+        f0,
+        [a.sum() for a in numpy.split(phoneme_bin_num, vowel_indexes[:-1] + 1)],
+    )
+
+    # Allocate a 2-D array of zeros with shape (len(phoneme), OjtPhoneme.num_phoneme (45))
+    array = numpy.zeros((len(phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32)
+    # Turn each row of the array into a one-hot vector
+    array[numpy.arange(len(phoneme)), phoneme] = 1
+    phoneme = array
+
+    return phoneme, f0
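+
+
+# Editorial sketch (toy values, not part of the original change): the
+# forward-decode mechanics above in isolation, using the phoneme IDs that
+# appear in the test (pau=0, k=23, o=30, N=4):
+#
+#     ids = numpy.array([0, 23, 30, 4], dtype=numpy.int64)   # pau k o N
+#     frames = numpy.array([1, 1, 2, 2], dtype=numpy.int32)  # frames per phoneme
+#     numpy.repeat(ids, frames)                       # -> [0, 23, 30, 30, 4, 4]
+#
+#     # vowel_indexes = [0, 2, 3]; split after every vowel except the last
+#     [a.sum() for a in numpy.split(frames, [1, 3])]  # -> [1, 3, 2]
+#     # i.e. the per-mora frame counts by which f0 is repeated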
+
+
 class SynthesisEngine(SynthesisEngineBase):
     def __init__(
         self,
@@ -410,69 +496,9 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
         # Decompose all AccentPhrases into Mora and OjtPhoneme form so they can be processed
         flatten_moras, phoneme_data_list = pre_process(query.accent_phrases)
 
-        # Build the list of phoneme IDs (OpenJTalk phoneme IDs) from the OjtPhoneme list
-        phoneme_list_s = numpy.array(
-            [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64
-        )
-
-        # length
-        # Flatten and concatenate the phoneme lengths; this includes the leading and trailing silence
-        phoneme_length_list = (
-            [query.prePhonemeLength]
-            + [
-                length
-                for mora in flatten_moras
-                for length in (
-                    [mora.consonant_length] if mora.consonant is not None else []
-                )
-                + [mora.vowel_length]
-            ]
-            + [query.postPhonemeLength]
+        phoneme, f0 = generate_frame_scale_features(
+            query, flatten_moras, phoneme_data_list
         )
-        # Cast to float
-        phoneme_length = numpy.array(phoneme_length_list, dtype=numpy.float32)
-
-        # Apply the Speed Scale (speaking rate) to the lengths
-        phoneme_length /= query.speedScale
-
-        # pitch
-        # Flatten and concatenate the mora pitches, then cast to float
-        f0_list = [0] + [mora.pitch for mora in flatten_moras] + [0]
-        f0 = numpy.array(f0_list, dtype=numpy.float32)
-        # Apply the pitch adjustment (multiply by 2 to the power of Pitch Scale)
-        f0 *= 2**query.pitchScale
-
-        # Determine which phonemes are voiced (pitch greater than 0)
-        voiced = f0 > 0
-        # Mean pitch of the voiced phonemes
-        mean_f0 = f0[voiced].mean()
-        # If the mean is not NaN, apply the intonation: the deviation from the mean
-        # scaled by the Intonation Scale ((f0 - mean_f0) * intonationScale), plus the mean (mean_f0)
-        if not numpy.isnan(mean_f0):
-            f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0
-
-        # Extract the vowel positions from the decomposed OjtPhoneme list into a numpy array
-        _, _, vowel_indexes_data = split_mora(phoneme_data_list)
-        vowel_indexes = numpy.array(vowel_indexes_data)
-
-        # forward decode
-        # Multiply the phoneme lengths by the frame rate and cast to int
-        rate = 24000 / 256
-        phoneme_bin_num = numpy.round(phoneme_length * rate).astype(numpy.int32)
-
-        # Repeat each phoneme ID for that phoneme's length in frames
-        phoneme = numpy.repeat(phoneme_list_s, phoneme_bin_num)
-        # Repeat each f0 value for the combined frame length of its consonant and vowel
-        f0 = numpy.repeat(
-            f0,
-            [a.sum() for a in numpy.split(phoneme_bin_num, vowel_indexes[:-1] + 1)],
-        )
-
-        # Allocate a 2-D array of zeros with shape (len(phoneme), OjtPhoneme.num_phoneme (45))
-        array = numpy.zeros((len(phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32)
-        # Turn each row of the array into a one-hot vector
-        array[numpy.arange(len(phoneme)), phoneme] = 1
-        phoneme = array
 
         # Run everything generated so far through decode_forward; the inference model synthesizes the audio waveform
         with self.mutex: