Skip to content

Commit

Permalink
_synthesis_impl() 前処理の関数化 (#784)
Browse files Browse the repository at this point in the history
Co-authored-by: Hiroshiba <[email protected]>
  • Loading branch information
tarepan and Hiroshiba authored Nov 28, 2023
1 parent bad1209 commit cb33ffa
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 63 deletions.
81 changes: 80 additions & 1 deletion test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math
from copy import deepcopy
from random import random
from typing import Union
from typing import Optional, Union
from unittest import TestCase
from unittest.mock import Mock

Expand All @@ -13,6 +13,7 @@

# TODO: import from voicevox_engine.synthesis_engine.mora
from voicevox_engine.synthesis_engine.synthesis_engine import (
generate_frame_scale_features,
mora_phoneme_list,
pre_process,
split_mora,
Expand Down Expand Up @@ -96,6 +97,84 @@ def is_model_loaded(self, style_id):
return True


def _gen_mora(
    text: str,
    consonant: Optional[str],
    consonant_length: Optional[float],
    vowel: str,
    vowel_length: float,
    pitch: float,
) -> Mora:
    """Build a `Mora` from the given attributes (test helper)."""
    attributes = {
        "text": text,
        "consonant": consonant,
        "consonant_length": consonant_length,
        "vowel": vowel,
        "vowel_length": vowel_length,
        "pitch": pitch,
    }
    return Mora(**attributes)


def test_generate_frame_scale_features():
    """Test `generate_frame_scale_features`."""
    # Inputs
    query = AudioQuery(
        accent_phrases=[],
        speedScale=2.0,
        pitchScale=2.0,
        intonationScale=0.5,
        prePhonemeLength=2 * 0.01067,  # 0.01067 [sec/frame]
        postPhonemeLength=6 * 0.01067,
        volumeScale=0.0,
        outputSamplingRate=0,
        outputStereo=False,
    )
    flatten_moras = [
        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
    ]
    phoneme_str = "pau k o N pau h i h O pau"
    phoneme_data_list = [OjtPhoneme(p, 0, 0) for p in phoneme_str.split()]

    # Ground Truths
    # Frames per phoneme; speedScale=2.0 halves every duration beforehand.
    #                  Pre k  o  N  pau h  i  h  O  Pst
    frm_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
    n_frm = sum(frm_per_phoneme)
    frm_per_phoneme = numpy.array(frm_per_phoneme, dtype=numpy.int32)

    # Check ground-truth/input consistency before calling the function under test.
    assert frm_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"

    # Phoneme ID (one-hot index) for each frame.
    #                Pr k   o   o   N  N  pau h   i   i   h   h   O  Pt Pt Pt
    phoneme_frms = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
    phoneme_gt = numpy.zeros([n_frm, 45], dtype=numpy.float32)
    for frm_idx, phoneme_idx in enumerate(phoneme_frms):
        phoneme_gt[frm_idx, phoneme_idx] = 1.0

    # Pitch: pitchScale=2.0 multiplies f0 by 2**2 = 4, then
    # intonationScale=0.5 halves the deviation from the voiced mean (300):
    #   per vowel: [0, 200, 200, 0, 500, 0, 0] -> [0, 250, 250, 0, 400, 0, 0]
    # Frame-wise f0, repeated per consonant+vowel group:
    #   frames per vowel group: pau ko N pau hi hO pau = [1, 3, 2, 1, 3, 3, 3]
    # pau ko ko ko N  N
    f0_gt_1 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0]
    # pau hi hi hi
    f0_gt_2 = [0.0, 400.0, 400.0, 400.0]
    # hO hO hO pau pau pau
    f0_gt_3 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    f0_gt = numpy.array(f0_gt_1 + f0_gt_2 + f0_gt_3, dtype=numpy.float32)

    # Outputs
    phoneme_pred, f0_pred = generate_frame_scale_features(
        query, flatten_moras, phoneme_data_list
    )

    assert numpy.array_equal(phoneme_pred, phoneme_gt), "Wrong phoneme onehot frames"
    assert numpy.array_equal(f0_pred, f0_gt), "Wrong frame-wise f0"


class TestSynthesisEngine(TestCase):
def setUp(self):
super().setUp()
Expand Down
150 changes: 88 additions & 62 deletions voicevox_engine/synthesis_engine/synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,92 @@ def pre_process(
return flatten_moras, phoneme_data_list


def generate_frame_scale_features(
    query: AudioQuery, flatten_moras: List[Mora], phoneme_data_list: List[OjtPhoneme]
):
    """
    Generate frame-scale (per-frame) features from phoneme-scale inputs.

    Parameters
    ----------
    query : AudioQuery
        Audio synthesis query (speed/pitch/intonation scales, silence lengths).
    flatten_moras : List[Mora]
        Flattened mora sequence.
    phoneme_data_list : List[OjtPhoneme]
        Phoneme sequence.

    Returns
    -------
    phoneme : NDArray[]
        Frame-wise phoneme one-hot vectors, shape (n_frames, num_phoneme).
    f0 : NDArray[]
        Frame-wise fundamental frequency (f0) sequence, shape (n_frames,).
    """
    # Build the list of OpenJTalk phoneme IDs from the OjtPhoneme list.
    phoneme_list_s = numpy.array(
        [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64
    )
    # length
    # Expand and concatenate per-phoneme durations (consonant length only
    # when the mora has a consonant), including leading/trailing silence.
    phoneme_length_list = (
        [query.prePhonemeLength]
        + [
            length
            for mora in flatten_moras
            for length in (
                [mora.consonant_length] if mora.consonant is not None else []
            )
            + [mora.vowel_length]
        ]
        + [query.postPhonemeLength]
    )
    # Cast to float.
    phoneme_length = numpy.array(phoneme_length_list, dtype=numpy.float32)

    # Apply the speed scale (speaking rate) to the durations.
    phoneme_length /= query.speedScale

    # pitch
    # Expand and concatenate mora pitches (0 pads for pre/post silence); cast to float.
    f0_list = [0] + [mora.pitch for mora in flatten_moras] + [0]
    f0 = numpy.array(f0_list, dtype=numpy.float32)
    # Apply pitch adjustment: multiply by 2 ** pitchScale.
    f0 *= 2**query.pitchScale

    # Mask of voiced phonemes (those with pitch > 0).
    voiced = f0 > 0
    # Mean pitch over voiced phonemes (NaN when nothing is voiced).
    mean_f0 = f0[voiced].mean()
    # When the mean is not NaN, apply intonation: scale the deviation from
    # the mean and add the mean back, (f0 - mean_f0) * intonationScale + mean_f0.
    if not numpy.isnan(mean_f0):
        f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0

    # Extract vowel positions from the decomposed phoneme list into a numpy array.
    _, _, vowel_indexes_data = split_mora(phoneme_data_list)
    vowel_indexes = numpy.array(vowel_indexes_data)

    # forward decode
    # Multiply durations by the frame rate and round to integer frame counts.
    rate = 24000 / 256  # framerate 93.75 [frame/sec]
    phoneme_bin_num = numpy.round(phoneme_length * rate).astype(numpy.int32)

    # Repeat each phoneme ID for its duration in frames.
    phoneme = numpy.repeat(phoneme_list_s, phoneme_bin_num)
    # Repeat each f0 value over the combined consonant+vowel frame count
    # (frame counts are grouped by splitting after each vowel index).
    f0 = numpy.repeat(
        f0,
        [a.sum() for a in numpy.split(phoneme_bin_num, vowel_indexes[:-1] + 1)],
    )

    # Allocate a zero matrix of shape (n_frames, OjtPhoneme.num_phoneme (45)).
    array = numpy.zeros((len(phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32)
    # Set one element per row, turning each row into a one-hot vector.
    array[numpy.arange(len(phoneme)), phoneme] = 1
    phoneme = array

    return phoneme, f0


class SynthesisEngine(SynthesisEngineBase):
def __init__(
self,
Expand Down Expand Up @@ -410,69 +496,9 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
# AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする
flatten_moras, phoneme_data_list = pre_process(query.accent_phrases)

# OjtPhonemeのリストからOjtPhonemeのPhoneme ID(OpenJTalkにおける音素のID)のリストを作る
phoneme_list_s = numpy.array(
[p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64
)

# length
# 音素の長さをリストに展開・結合する。ここには前後の無音時間も含まれる
phoneme_length_list = (
[query.prePhonemeLength]
+ [
length
for mora in flatten_moras
for length in (
[mora.consonant_length] if mora.consonant is not None else []
)
+ [mora.vowel_length]
]
+ [query.postPhonemeLength]
phoneme, f0 = generate_frame_scale_features(
query, flatten_moras, phoneme_data_list
)
# floatにキャスト
phoneme_length = numpy.array(phoneme_length_list, dtype=numpy.float32)

# lengthにSpeed Scale(話速)を適用する
phoneme_length /= query.speedScale

# pitch
# モーラの音高(ピッチ)を展開・結合し、floatにキャストする
f0_list = [0] + [mora.pitch for mora in flatten_moras] + [0]
f0 = numpy.array(f0_list, dtype=numpy.float32)
# 音高(ピッチ)の調節を適用する(2のPitch Scale乗を掛ける)
f0 *= 2**query.pitchScale

# 有声音素(音高(ピッチ)が0より大きいもの)か否かを抽出する
voiced = f0 > 0
# 有声音素の音高(ピッチ)の平均値を求める
mean_f0 = f0[voiced].mean()
# 平均値がNaNではないとき、抑揚を適用する
# 抑揚は音高と音高の平均値の差に抑揚を掛けたもの((f0 - mean_f0) * Intonation Scale)に抑揚の平均値(mean_f0)を足したもの
if not numpy.isnan(mean_f0):
f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0

# OjtPhonemeの形に分解された音素リストから、vowel(母音)の位置を抜き出し、numpyのarrayにする
_, _, vowel_indexes_data = split_mora(phoneme_data_list)
vowel_indexes = numpy.array(vowel_indexes_data)

# forward decode
# 音素の長さにrateを掛け、intにキャストする
rate = 24000 / 256
phoneme_bin_num = numpy.round(phoneme_length * rate).astype(numpy.int32)

# Phoneme IDを音素の長さ分繰り返す
phoneme = numpy.repeat(phoneme_list_s, phoneme_bin_num)
# f0を母音と子音の長さの合計分繰り返す
f0 = numpy.repeat(
f0,
[a.sum() for a in numpy.split(phoneme_bin_num, vowel_indexes[:-1] + 1)],
)

# phonemeの長さとOjtPhonemeのnum_phoneme(45)分の0で初期化された2次元配列を用意する
array = numpy.zeros((len(phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32)
# 初期化された2次元配列の各行をone hotにする
array[numpy.arange(len(phoneme)), phoneme] = 1
phoneme = array

# 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する
with self.mutex:
Expand Down

0 comments on commit cb33ffa

Please sign in to comment.