From 6ff13e0e47056416bd1e6e83d8b37aa41e89148c Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 4 Jan 2025 17:23:26 +0900 Subject: [PATCH] =?UTF-8?q?refactor:=20=E5=85=A5=E5=87=BA=E5=8A=9B?= =?UTF-8?q?=E5=91=A8=E3=82=8A=E3=81=AE=E3=81=84=E3=81=8F=E3=81=A4=E3=81=8B?= =?UTF-8?q?=E3=81=AE=E5=87=A6=E7=90=86=E3=82=92`synthesizer`=E3=81=8B?= =?UTF-8?q?=E3=82=89=E7=A7=BB=E5=8B=95=20(#917)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #916 をやるためのリファクタ。 --- crates/voicevox_core/src/core.rs | 3 + crates/voicevox_core/src/core/adjust.rs | 6 + crates/voicevox_core/src/core/adjust/post.rs | 12 + crates/voicevox_core/src/core/adjust/pre.rs | 37 ++ crates/voicevox_core/src/engine/audio_file.rs | 28 ++ .../src/engine/interpret_query.rs | 243 +++++++++++++ crates/voicevox_core/src/engine/mod.rs | 7 +- crates/voicevox_core/src/lib.rs | 1 + crates/voicevox_core/src/synthesizer.rs | 330 ++---------------- 9 files changed, 354 insertions(+), 313 deletions(-) create mode 100644 crates/voicevox_core/src/core.rs create mode 100644 crates/voicevox_core/src/core/adjust.rs create mode 100644 crates/voicevox_core/src/core/adjust/post.rs create mode 100644 crates/voicevox_core/src/core/adjust/pre.rs create mode 100644 crates/voicevox_core/src/engine/interpret_query.rs diff --git a/crates/voicevox_core/src/core.rs b/crates/voicevox_core/src/core.rs new file mode 100644 index 000000000..67adb7a23 --- /dev/null +++ b/crates/voicevox_core/src/core.rs @@ -0,0 +1,3 @@ +mod adjust; + +pub(crate) use self::adjust::{ensure_minimum_phoneme_length, pad_decoder_feature}; diff --git a/crates/voicevox_core/src/core/adjust.rs b/crates/voicevox_core/src/core/adjust.rs new file mode 100644 index 000000000..6ab01c323 --- /dev/null +++ b/crates/voicevox_core/src/core/adjust.rs @@ -0,0 +1,6 @@ +//! 推論操作の前処理と後処理。 + +mod post; +mod pre; + +pub(crate) use self::{post::ensure_minimum_phoneme_length, pre::pad_decoder_feature}; diff --git a/crates/voicevox_core/src/core/adjust/post.rs b/crates/voicevox_core/src/core/adjust/post.rs new file mode 100644 index 000000000..11667c09e --- /dev/null +++ b/crates/voicevox_core/src/core/adjust/post.rs @@ -0,0 +1,12 @@ +//! 推論の出力の後処理。 + +pub(crate) fn ensure_minimum_phoneme_length(mut output: Vec) -> Vec { + const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + + for output_item in output.iter_mut() { + if *output_item < PHONEME_LENGTH_MINIMAL { + *output_item = PHONEME_LENGTH_MINIMAL; + } + } + output +} diff --git a/crates/voicevox_core/src/core/adjust/pre.rs b/crates/voicevox_core/src/core/adjust/pre.rs new file mode 100644 index 000000000..1e0036ab0 --- /dev/null +++ b/crates/voicevox_core/src/core/adjust/pre.rs @@ -0,0 +1,37 @@ +//! 推論の入力の前処理。 + +/// 音が途切れてしまうのを避けるworkaround処理。 +// TODO: 改善したらここのpadding処理を取り除く +pub(crate) fn pad_decoder_feature( + f0: ndarray::Array1, + phoneme: ndarray::Array2, +) -> (usize, ndarray::Array1, ndarray::Array2) { + let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; + let length_with_padding = f0.len() + start_and_end_padding_size; + let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); + let phoneme_with_padding = make_phoneme_with_padding(phoneme, PADDING_FRAME_LENGTH); + return (length_with_padding, f0_with_padding, phoneme_with_padding); + + fn make_f0_with_padding( + f0_slice: ndarray::Array1, + padding_size: usize, + ) -> ndarray::Array1 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let padding = ndarray::Array1::::zeros(padding_size); + ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] + } + + fn make_phoneme_with_padding( + phoneme_slice: ndarray::Array2, + padding_size: usize, + ) -> ndarray::Array2 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let mut padding = ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); + padding + .slice_mut(ndarray::s![.., 0]) + .assign(&ndarray::arr0(1.0)); + ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] + } +} diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs index 470f5fc84..ba7dadf63 100644 --- a/crates/voicevox_core/src/engine/audio_file.rs +++ b/crates/voicevox_core/src/engine/audio_file.rs @@ -1,5 +1,33 @@ use std::io::{Cursor, Write as _}; +use super::AudioQuery; + +pub(crate) fn to_s16le_pcm( + wave: &[f32], + &AudioQuery { + volume_scale, + output_sampling_rate, + output_stereo, + .. + }: &AudioQuery, +) -> Vec { + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let repeat_count: u32 = (output_sampling_rate / BASE_SAMPLING_RATE) * num_channels as u32; + let bytes_size = wave.len() as u32 * repeat_count * 2; + let buf: Vec = Vec::with_capacity(bytes_size as usize); + let mut cur = Cursor::new(buf); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); + } + } + + cur.into_inner() +} + /// 16bit PCMにヘッダを付加しWAVフォーマットのバイナリを生成する。 pub fn wav_from_s16le(pcm: &[u8], sampling_rate: u32, is_stereo: bool) -> Vec { let num_channels: u16 = if is_stereo { 2 } else { 1 }; diff --git a/crates/voicevox_core/src/engine/interpret_query.rs b/crates/voicevox_core/src/engine/interpret_query.rs new file mode 100644 index 000000000..841364114 --- /dev/null +++ b/crates/voicevox_core/src/engine/interpret_query.rs @@ -0,0 +1,243 @@ +//! [`AudioQuery`]から特徴量を取り出す処理を集めたもの。 + +use super::{full_context_label::mora_to_text, AccentPhrase, AudioQuery, Mora, OjtPhoneme}; + +pub(crate) fn initial_process(accent_phrases: &[AccentPhrase]) -> (Vec, Vec) { + let flatten_moras = to_flatten_moras(accent_phrases); + + let mut phoneme_strings = vec!["pau".to_string()]; + for mora in flatten_moras.iter() { + if let Some(consonant) = &mora.consonant { + phoneme_strings.push(consonant.clone()) + } + phoneme_strings.push(mora.vowel.clone()); + } + phoneme_strings.push("pau".to_string()); + + let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); + + return (flatten_moras, phoneme_data_list); + + fn to_flatten_moras(accent_phrases: &[AccentPhrase]) -> Vec { + let mut flatten_moras = Vec::new(); + + for AccentPhrase { + moras, pause_mora, .. + } in accent_phrases + { + for mora in moras { + flatten_moras.push(mora.clone()); + } + if let Some(pause_mora) = pause_mora { + flatten_moras.push(pause_mora.clone()); + } + } + + flatten_moras + } + + fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { + OjtPhoneme::convert( + phoneme_str_list + .iter() + .map(AsRef::as_ref) + .map(ToOwned::to_owned) + .map(OjtPhoneme::new) + .collect::>() + .as_slice(), + ) + } +} + +pub(crate) fn split_mora( + phoneme_list: &[OjtPhoneme], +) -> (Vec, Vec, Vec) { + let mut vowel_indexes = Vec::new(); + for (i, phoneme) in phoneme_list.iter().enumerate() { + const MORA_PHONEME_LIST: &[&str] = &[ + "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", + ]; + + if MORA_PHONEME_LIST + .iter() + .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) + { + vowel_indexes.push(i as i64); + } + } + + let vowel_phoneme_list = vowel_indexes + .iter() + .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) + .collect(); + + let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; + for i in 0..(vowel_indexes.len() - 1) { + let prev = vowel_indexes[i]; + let next = vowel_indexes[i + 1]; + if next - prev == 1 { + consonant_phoneme_list.push(OjtPhoneme::default()); + } else { + consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); + } + } + + (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) +} + +pub(crate) struct DecoderFeature { + pub(crate) f0: Vec, + pub(crate) phoneme: Vec<[f32; OjtPhoneme::num_phoneme()]>, +} + +impl AudioQuery { + pub(crate) fn decoder_feature(&self, enable_interrogative_upspeak: bool) -> DecoderFeature { + let AudioQuery { + accent_phrases, + speed_scale, + pitch_scale, + intonation_scale, + pre_phoneme_length, + post_phoneme_length, + .. + } = self; + + let accent_phrases = if enable_interrogative_upspeak { + &adjust_interrogative_accent_phrases(accent_phrases) + } else { + accent_phrases + }; + + let (flatten_moras, phoneme_data_list) = initial_process(accent_phrases); + + let mut phoneme_length_list = vec![*pre_phoneme_length]; + let mut f0_list = vec![0.]; + let mut voiced_list = vec![false]; + { + let mut sum_of_f0_bigger_than_zero = 0.; + let mut count_of_f0_bigger_than_zero = 0; + + for Mora { + consonant_length, + vowel_length, + pitch, + .. + } in flatten_moras + { + if let Some(consonant_length) = consonant_length { + phoneme_length_list.push(consonant_length); + } + phoneme_length_list.push(vowel_length); + + let f0_single = pitch * 2.0_f32.powf(*pitch_scale); + f0_list.push(f0_single); + + let bigger_than_zero = f0_single > 0.; + voiced_list.push(bigger_than_zero); + + if bigger_than_zero { + sum_of_f0_bigger_than_zero += f0_single; + count_of_f0_bigger_than_zero += 1; + } + } + phoneme_length_list.push(*post_phoneme_length); + f0_list.push(0.); + voiced_list.push(false); + let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); + + if !mean_f0.is_nan() { + for i in 0..f0_list.len() { + if voiced_list[i] { + f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; + } + } + } + } + + let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); + + let mut phoneme = Vec::new(); + let mut f0: Vec = Vec::new(); + { + const RATE: f32 = 24000. / 256.; + let mut sum_of_phoneme_length = 0; + let mut count_of_f0 = 0; + let mut vowel_indexes_index = 0; + + for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { + // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする + // + // https://github.com/VOICEVOX/voicevox_engine/issues/552 + let phoneme_length = ((*phoneme_length * RATE).round_ties_even() / speed_scale) + .round_ties_even() as usize; + let phoneme_id = phoneme_data_list[i].phoneme_id(); + + for _ in 0..phoneme_length { + let mut phonemes_vec = [0.; OjtPhoneme::num_phoneme()]; + phonemes_vec[phoneme_id as usize] = 1.; + phoneme.push(phonemes_vec) + } + sum_of_phoneme_length += phoneme_length; + + if i as i64 == vowel_indexes[vowel_indexes_index] { + for _ in 0..sum_of_phoneme_length { + f0.push(f0_list[count_of_f0]); + } + count_of_f0 += 1; + sum_of_phoneme_length = 0; + vowel_indexes_index += 1; + } + } + } + return DecoderFeature { f0, phoneme }; + + fn adjust_interrogative_accent_phrases( + accent_phrases: &[AccentPhrase], + ) -> Vec { + accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: adjust_interrogative_moras(accent_phrase), + ..accent_phrase.clone() + }) + .collect() + } + + fn adjust_interrogative_moras( + AccentPhrase { + moras, + is_interrogative, + .. + }: &AccentPhrase, + ) -> Vec { + if *is_interrogative && !moras.is_empty() { + let last_mora = moras.last().unwrap(); + if last_mora.pitch != 0.0 { + let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); + new_moras.extend_from_slice(moras.as_slice()); + let interrogative_mora = make_interrogative_mora(last_mora); + new_moras.push(interrogative_mora); + return new_moras; + } + } + moras.clone() + } + + fn make_interrogative_mora(last_mora: &Mora) -> Mora { + const FIX_VOWEL_LENGTH: f32 = 0.15; + const ADJUST_PITCH: f32 = 0.3; + const MAX_PITCH: f32 = 6.5; + + let pitch = (last_mora.pitch + ADJUST_PITCH).min(MAX_PITCH); + + Mora { + text: mora_to_text(None, &last_mora.vowel), + consonant: None, + consonant_length: None, + vowel: last_mora.vowel.clone(), + vowel_length: FIX_VOWEL_LENGTH, + pitch, + } + } + } +} diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 32e304114..0c1576ca3 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -1,16 +1,17 @@ mod acoustic_feature_extractor; mod audio_file; mod full_context_label; +mod interpret_query; mod kana_parser; mod model; mod mora_list; pub(crate) mod open_jtalk; pub(crate) use self::acoustic_feature_extractor::OjtPhoneme; +pub(crate) use self::audio_file::to_s16le_pcm; pub use self::audio_file::wav_from_s16le; -pub(crate) use self::full_context_label::{ - extract_full_context_label, mora_to_text, FullContextLabelError, -}; +pub(crate) use self::full_context_label::{extract_full_context_label, FullContextLabelError}; +pub(crate) use self::interpret_query::{initial_process, split_mora, DecoderFeature}; pub(crate) use self::kana_parser::{create_kana, parse_kana, KanaParseError}; pub use self::model::{AccentPhrase, AudioQuery, Mora}; pub(crate) use self::mora_list::mora2text; diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs index 6aa121cb5..a112cecb3 100644 --- a/crates/voicevox_core/src/lib.rs +++ b/crates/voicevox_core/src/lib.rs @@ -49,6 +49,7 @@ const _: () = { }; mod asyncs; +mod core; mod devices; /// cbindgen:ignore mod engine; diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 07454c9e9..5215d7976 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,17 +1,16 @@ use easy_ext::ext; use enum_map::enum_map; -use std::{ - io::{Cursor, Write as _}, - marker::PhantomData, - ops::Range, - sync::Arc, -}; +use std::{marker::PhantomData, ops::Range, sync::Arc}; use tracing::info; use crate::{ asyncs::{Async, BlockingThreadPool, SingleTasked}, + core::{ensure_minimum_phoneme_length, pad_decoder_feature}, devices::{DeviceSpec, GpuSpec}, - engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme}, + engine::{ + create_kana, initial_process, split_mora, to_s16le_pcm, wav_from_s16le, DecoderFeature, + Mora, OjtPhoneme, + }, error::ErrorRepr, infer::{ self, @@ -331,103 +330,8 @@ trait AsInner { style_id: StyleId, options: &SynthesisOptions, ) -> Result { - let AudioQuery { - accent_phrases, - speed_scale, - pitch_scale, - intonation_scale, - pre_phoneme_length, - post_phoneme_length, - .. - } = audio_query; - - let accent_phrases = if options.enable_interrogative_upspeak { - &adjust_interrogative_accent_phrases(accent_phrases) - } else { - accent_phrases - }; - - let (flatten_moras, phoneme_data_list) = initial_process(accent_phrases); - - let mut phoneme_length_list = vec![*pre_phoneme_length]; - let mut f0_list = vec![0.]; - let mut voiced_list = vec![false]; - { - let mut sum_of_f0_bigger_than_zero = 0.; - let mut count_of_f0_bigger_than_zero = 0; - - for Mora { - consonant_length, - vowel_length, - pitch, - .. - } in flatten_moras - { - if let Some(consonant_length) = consonant_length { - phoneme_length_list.push(consonant_length); - } - phoneme_length_list.push(vowel_length); - - let f0_single = pitch * 2.0_f32.powf(*pitch_scale); - f0_list.push(f0_single); - - let bigger_than_zero = f0_single > 0.; - voiced_list.push(bigger_than_zero); - - if bigger_than_zero { - sum_of_f0_bigger_than_zero += f0_single; - count_of_f0_bigger_than_zero += 1; - } - } - phoneme_length_list.push(*post_phoneme_length); - f0_list.push(0.); - voiced_list.push(false); - let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); - - if !mean_f0.is_nan() { - for i in 0..f0_list.len() { - if voiced_list[i] { - f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; - } - } - } - } - - let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); - - let mut phoneme = Vec::new(); - let mut f0: Vec = Vec::new(); - { - const RATE: f32 = 24000. / 256.; - let mut sum_of_phoneme_length = 0; - let mut count_of_f0 = 0; - let mut vowel_indexes_index = 0; - - for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { - // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする - // - // https://github.com/VOICEVOX/voicevox_engine/issues/552 - let phoneme_length = ((*phoneme_length * RATE).round_ties_even() / speed_scale) - .round_ties_even() as usize; - let phoneme_id = phoneme_data_list[i].phoneme_id(); - - for _ in 0..phoneme_length { - let mut phonemes_vec = [0.; OjtPhoneme::num_phoneme()]; - phonemes_vec[phoneme_id as usize] = 1.; - phoneme.push(phonemes_vec) - } - sum_of_phoneme_length += phoneme_length; - - if i as i64 == vowel_indexes[vowel_indexes_index] { - for _ in 0..sum_of_phoneme_length { - f0.push(f0_list[count_of_f0]); - } - count_of_f0 += 1; - sum_of_phoneme_length = 0; - vowel_indexes_index += 1; - } - } - } + let DecoderFeature { f0, phoneme } = + audio_query.decoder_feature(options.enable_interrogative_upspeak); let spec = self .generate_full_intermediate( @@ -438,62 +342,13 @@ trait AsInner { style_id, ) .await?; - return Ok(AudioFeature { + Ok(AudioFeature { internal_state: spec, style_id, frame_length: f0.len(), frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0, audio_query: audio_query.clone(), - }); - - fn adjust_interrogative_accent_phrases( - accent_phrases: &[AccentPhrase], - ) -> Vec { - accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: adjust_interrogative_moras(accent_phrase), - ..accent_phrase.clone() - }) - .collect() - } - - fn adjust_interrogative_moras( - AccentPhrase { - moras, - is_interrogative, - .. - }: &AccentPhrase, - ) -> Vec { - if *is_interrogative && !moras.is_empty() { - let last_mora = moras.last().unwrap(); - if last_mora.pitch != 0.0 { - let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); - new_moras.extend_from_slice(moras.as_slice()); - let interrogative_mora = make_interrogative_mora(last_mora); - new_moras.push(interrogative_mora); - return new_moras; - } - } - moras.clone() - } - - fn make_interrogative_mora(last_mora: &Mora) -> Mora { - const FIX_VOWEL_LENGTH: f32 = 0.15; - const ADJUST_PITCH: f32 = 0.3; - const MAX_PITCH: f32 = 6.5; - - let pitch = (last_mora.pitch + ADJUST_PITCH).min(MAX_PITCH); - - Mora { - text: mora_to_text(None, &last_mora.vowel), - consonant: None, - consonant_length: None, - vowel: last_mora.vowel.clone(), - vowel_length: FIX_VOWEL_LENGTH, - pitch, - } - } + }) } async fn render(&self, audio: &AudioFeature, range: Range) -> Result> { @@ -508,38 +363,11 @@ trait AsInner { .render_audio_segment(spec_segment.to_owned(), audio.style_id) .await?; let wave = trim_margin_from_wave(wave_with_margin); - return Ok(to_s16le_pcm( + Ok(to_s16le_pcm::( wave.as_slice() .expect("`trim_margin_from_wave` should just trim an array"), &audio.audio_query, - )); - - fn to_s16le_pcm( - wave: &[f32], - &AudioQuery { - volume_scale, - output_sampling_rate, - output_stereo, - .. - }: &AudioQuery, - ) -> Vec { - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let bytes_size = wave.len() as u32 * repeat_count * 2; - let buf: Vec = Vec::with_capacity(bytes_size as usize); - let mut cur = Cursor::new(buf); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } - } - - cur.into_inner() - } + )) } async fn synthesis( @@ -957,17 +785,7 @@ impl Status { }, ) .await?; - let mut output = output.into_raw_vec(); - - for output_item in output.iter_mut() { - if *output_item < PHONEME_LENGTH_MINIMAL { - *output_item = PHONEME_LENGTH_MINIMAL; - } - } - - return Ok(output); - - const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + Ok(ensure_minimum_phoneme_length(output.into_raw_vec())) } #[expect( @@ -1020,15 +838,11 @@ impl Status { ) -> Result> { let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - // 音が途切れてしまうのを避けるworkaround処理が入っている - // TODO: 改善したらここのpadding処理を取り除く - let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; - let length_with_padding = f0.len() + start_and_end_padding_size; - let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); - let phoneme_with_padding = make_phoneme_with_padding( - phoneme_vector.into_shape([length, phoneme_size]).unwrap(), - PADDING_FRAME_LENGTH, - ); + let (length_with_padding, f0_with_padding, phoneme_with_padding) = + pad_decoder_feature::( + f0, + phoneme_vector.into_shape([length, phoneme_size]).unwrap(), + ); let GenerateFullIntermediateOutput { spec: spec_with_padding, @@ -1051,36 +865,13 @@ impl Status { unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); } // マージン分を両端に残して音声特徴量を返す - return Ok(spec_with_padding + Ok(spec_with_padding .slice(ndarray::s![ PADDING_FRAME_LENGTH - MARGIN ..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN, .. ]) - .to_owned()); - - fn make_f0_with_padding( - f0_slice: ndarray::Array1, - padding_size: usize, - ) -> ndarray::Array1 { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let padding = ndarray::Array1::::zeros(padding_size); - ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] - } - - fn make_phoneme_with_padding( - phoneme_slice: ndarray::Array2, - padding_size: usize, - ) -> ndarray::Array2 { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut padding = ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); - padding - .slice_mut(ndarray::s![.., 0]) - .assign(&ndarray::arr0(1.0)); - ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] - } + .to_owned()) } /// 与えられた音声特徴量で音声生成。 @@ -1254,87 +1045,6 @@ fn list_windows_video_cards() { } } -fn initial_process(accent_phrases: &[AccentPhrase]) -> (Vec, Vec) { - let flatten_moras = to_flatten_moras(accent_phrases); - - let mut phoneme_strings = vec!["pau".to_string()]; - for mora in flatten_moras.iter() { - if let Some(consonant) = &mora.consonant { - phoneme_strings.push(consonant.clone()) - } - phoneme_strings.push(mora.vowel.clone()); - } - phoneme_strings.push("pau".to_string()); - - let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); - - return (flatten_moras, phoneme_data_list); - - fn to_flatten_moras(accent_phrases: &[AccentPhrase]) -> Vec { - let mut flatten_moras = Vec::new(); - - for AccentPhrase { - moras, pause_mora, .. - } in accent_phrases - { - for mora in moras { - flatten_moras.push(mora.clone()); - } - if let Some(pause_mora) = pause_mora { - flatten_moras.push(pause_mora.clone()); - } - } - - flatten_moras - } - - fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { - OjtPhoneme::convert( - phoneme_str_list - .iter() - .map(AsRef::as_ref) - .map(ToOwned::to_owned) - .map(OjtPhoneme::new) - .collect::>() - .as_slice(), - ) - } -} - -fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { - let mut vowel_indexes = Vec::new(); - for (i, phoneme) in phoneme_list.iter().enumerate() { - const MORA_PHONEME_LIST: &[&str] = &[ - "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", - ]; - - if MORA_PHONEME_LIST - .iter() - .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) - { - vowel_indexes.push(i as i64); - } - } - - let vowel_phoneme_list = vowel_indexes - .iter() - .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) - .collect(); - - let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; - for i in 0..(vowel_indexes.len() - 1) { - let prev = vowel_indexes[i]; - let next = vowel_indexes[i + 1]; - if next - prev == 1 { - consonant_phoneme_list.push(OjtPhoneme::default()); - } else { - consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); - } - } - - (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) -} - impl AudioQuery { fn from_accent_phrases(accent_phrases: Vec) -> Self { let kana = create_kana(&accent_phrases);