From 203410c26e867d6f4f2efa85a5d6086d3612d588 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Fri, 20 Dec 2024 22:10:19 +0900 Subject: [PATCH] =?UTF-8?q?refactor:=20`mod=20inner`=E3=82=92=E5=89=8A?= =?UTF-8?q?=E9=99=A4=20(#897)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #865 で入れた`mod inner`を消す。 Refs: https://github.com/VOICEVOX/voicevox_core/pull/865/files#r1823086190 --- crates/voicevox_core/src/synthesizer.rs | 2114 ++++++++--------- .../voicevox_core_c_api/tests/e2e/log_mask.rs | 2 +- .../tests/e2e/snapshots.toml | 20 +- 3 files changed, 1053 insertions(+), 1083 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 7437a3ca5..e9240de86 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,9 +1,36 @@ -use crate::{ - asyncs::{BlockingThreadPool, SingleTasked}, - infer, +use easy_ext::ext; +use enum_map::enum_map; +use std::{ + io::{Cursor, Write as _}, + marker::PhantomData, + ops::Range, + sync::Arc, }; +use tracing::info; -pub use self::inner::MARGIN; +use crate::{ + asyncs::{Async, BlockingThreadPool, SingleTasked}, + devices::{DeviceSpec, GpuSpec}, + engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme}, + error::ErrorRepr, + infer::{ + self, + domains::{ + FrameDecodeDomain, FrameDecodeOperation, GenerateFullIntermediateInput, + GenerateFullIntermediateOutput, InferenceDomainMap, PredictDurationInput, + PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, + PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input, + PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, + RenderAudioSegmentInput, RenderAudioSegmentOutput, SfDecodeInput, SfDecodeOutput, + SingingTeacherDomain, SingingTeacherOperation, TalkDomain, TalkOperation, + }, + InferenceRuntime, InferenceSessionOptions, + }, + status::Status, + text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, + voice_model, AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, VoiceModelId, + VoiceModelMeta, +}; /// [`blocking::Synthesizer::synthesis`]および[`nonblocking::Synthesizer::synthesis`]のオプション。 /// @@ -100,1211 +127,1154 @@ impl AsyncExt for BlockingThreadPool { } } -mod inner { - use easy_ext::ext; - use enum_map::enum_map; - use std::{ - io::{Cursor, Write as _}, - marker::PhantomData, - ops::Range, - sync::Arc, - }; - use tracing::info; +const DEFAULT_SAMPLING_RATE: u32 = 24000; +/// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(フレーム数) +const PADDING_FRAME_LENGTH: usize = 38; // (0.4秒 * 24000Hz / 256.0).round() +/// 音声生成の際、音声特徴量の前後に確保すべきマージン幅(フレーム数) +/// モデルの受容野から計算される +pub const MARGIN: usize = 14; +/// 指定した音声区間に対応する特徴量を両端にマージンを追加した上で切り出す +fn crop_with_margin(audio: &AudioFeature, range: Range) -> ndarray::ArrayView2<'_, f32> { + if range.start > audio.frame_length || range.end > audio.frame_length { + panic!( + "{range:?} is out of range for audio feature of length {frame_length}", + frame_length = audio.frame_length, + ); + } + if range.start > range.end { + panic!("{range:?} is invalid because start > end",); + } + let range = range.start..range.end + 2 * MARGIN; + audio.internal_state.slice(ndarray::s![range, ..]) +} +/// 追加した安全マージンを生成音声から取り除く +fn trim_margin_from_wave(wave_with_margin: ndarray::Array1) -> ndarray::Array1 { + let len = wave_with_margin.len(); + wave_with_margin.slice_move(ndarray::s![MARGIN * 256..len - MARGIN * 256]) +} - use crate::{ - asyncs::{Async, 
BlockingThreadPool, SingleTasked}, - devices::{DeviceSpec, GpuSpec}, - engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme}, - error::ErrorRepr, - infer::{ - self, - domains::{ - FrameDecodeDomain, FrameDecodeOperation, GenerateFullIntermediateInput, - GenerateFullIntermediateOutput, InferenceDomainMap, PredictDurationInput, - PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, - PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, - PredictSingF0Input, PredictSingF0Output, PredictSingVolumeInput, - PredictSingVolumeOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput, - SfDecodeInput, SfDecodeOutput, SingingTeacherDomain, SingingTeacherOperation, - TalkDomain, TalkOperation, - }, - InferenceRuntime, InferenceSessionOptions, - }, - status::Status, - text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, - voice_model, AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, - SynthesisOptions, VoiceModelId, VoiceModelMeta, - }; +/// 音声の中間表現。 +pub struct AudioFeature { + /// (フレーム数, 特徴数)の形を持つ音声特徴量。 + internal_state: ndarray::Array2, + /// 生成時に指定したスタイル番号。 + style_id: crate::StyleId, + /// workaround paddingを除いた音声特徴量のフレーム数。 + pub frame_length: usize, + /// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。 + pub frame_rate: f64, + /// 生成時に利用したクエリ。 + audio_query: AudioQuery, +} - use super::{AccelerationMode, AsyncExt, InitializeOptions, TtsOptions}; - - const DEFAULT_SAMPLING_RATE: u32 = 24000; - /// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(フレーム数) - const PADDING_FRAME_LENGTH: usize = 38; // (0.4秒 * 24000Hz / 256.0).round() - /// 音声生成の際、音声特徴量の前後に確保すべきマージン幅(フレーム数) - /// モデルの受容野から計算される - pub const MARGIN: usize = 14; - /// 指定した音声区間に対応する特徴量を両端にマージンを追加した上で切り出す - fn crop_with_margin(audio: &AudioFeature, range: Range) -> ndarray::ArrayView2<'_, f32> { - if range.start > audio.frame_length || range.end > audio.frame_length { - panic!( - "{range:?} is out of range for audio feature of length {frame_length}", - frame_length = audio.frame_length, - ); - } - if range.start > range.end { - panic!("{range:?} is invalid because start > end",); - } - let range = range.start..range.end + 2 * MARGIN; - audio.internal_state.slice(ndarray::s![range, ..]) - } - /// 追加した安全マージンを生成音声から取り除く - fn trim_margin_from_wave(wave_with_margin: ndarray::Array1) -> ndarray::Array1 { - let len = wave_with_margin.len(); - wave_with_margin.slice_move(ndarray::s![MARGIN * 256..len - MARGIN * 256]) - } - - /// 音声の中間表現。 - pub struct AudioFeature { - /// (フレーム数, 特徴数)の形を持つ音声特徴量。 - internal_state: ndarray::Array2, - /// 生成時に指定したスタイル番号。 - style_id: crate::StyleId, - /// workaround paddingを除いた音声特徴量のフレーム数。 - pub frame_length: usize, - /// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。 - pub frame_rate: f64, - /// 生成時に利用したクエリ。 - audio_query: AudioQuery, - } - - pub struct Inner { - pub(super) status: Arc>, - open_jtalk_analyzer: OpenJTalkAnalyzer, - kana_analyzer: KanaAnalyzer, - use_gpu: bool, - _marker: PhantomData A>, - } - - impl From> for Inner { - fn from(from: Inner) -> Self { - Self { - status: from.status, - open_jtalk_analyzer: from.open_jtalk_analyzer, - kana_analyzer: KanaAnalyzer, - use_gpu: from.use_gpu, - _marker: PhantomData, - } +struct Inner { + status: Arc>, + open_jtalk_analyzer: OpenJTalkAnalyzer, + kana_analyzer: KanaAnalyzer, + use_gpu: bool, + _marker: PhantomData A>, +} + +impl From> for Inner { + fn from(from: Inner) -> Self { + Self { + status: from.status, + open_jtalk_analyzer: from.open_jtalk_analyzer, + kana_analyzer: KanaAnalyzer, + use_gpu: 
from.use_gpu, + _marker: PhantomData, } } +} - impl Inner { - pub(super) fn new( - onnxruntime: &'static crate::blocking::Onnxruntime, - open_jtalk: O, - options: &InitializeOptions, - ) -> Result { - #[cfg(windows)] - list_windows_video_cards(); - - let test_gpus = || { - info!("GPUをテストします:"); - let availabilities = crate::devices::test_gpus( - GpuSpec::defaults(), - crate::blocking::Onnxruntime::DISPLAY_NAME, - onnxruntime.supported_devices()?, - |gpu| onnxruntime.test_gpu(gpu), - ); - for line in availabilities.to_string().lines() { - info!(" {line}"); - } - crate::Result::Ok(availabilities) - }; +impl Inner { + fn new( + onnxruntime: &'static crate::blocking::Onnxruntime, + open_jtalk: O, + options: &InitializeOptions, + ) -> Result { + #[cfg(windows)] + list_windows_video_cards(); + + let test_gpus = || { + info!("GPUをテストします:"); + let availabilities = crate::devices::test_gpus( + GpuSpec::defaults(), + crate::blocking::Onnxruntime::DISPLAY_NAME, + onnxruntime.supported_devices()?, + |gpu| onnxruntime.test_gpu(gpu), + ); + for line in availabilities.to_string().lines() { + info!(" {line}"); + } + crate::Result::Ok(availabilities) + }; - let device_for_heavy = match options.acceleration_mode { - AccelerationMode::Auto => match *test_gpus()?.oks() { - [] => DeviceSpec::Cpu, + let device_for_heavy = match options.acceleration_mode { + AccelerationMode::Auto => match *test_gpus()?.oks() { + [] => DeviceSpec::Cpu, + [gpu, ..] => DeviceSpec::Gpu(gpu), + }, + AccelerationMode::Cpu => DeviceSpec::Cpu, + AccelerationMode::Gpu => { + let availabilities = test_gpus()?; + match *availabilities.oks() { + [] => return Err(ErrorRepr::GpuSupport(availabilities).into()), [gpu, ..] => DeviceSpec::Gpu(gpu), - }, - AccelerationMode::Cpu => DeviceSpec::Cpu, - AccelerationMode::Gpu => { - let availabilities = test_gpus()?; - match *availabilities.oks() { - [] => return Err(ErrorRepr::GpuSupport(availabilities).into()), - [gpu, ..] => DeviceSpec::Gpu(gpu), - } } - }; - - info!("{device_for_heavy}を利用します"); - - // 軽いモデルはこちらを使う - let light_session_options = - InferenceSessionOptions::new(options.cpu_num_threads, DeviceSpec::Cpu); - - // 重いモデルはこちらを使う - let heavy_session_options = - InferenceSessionOptions::new(options.cpu_num_threads, device_for_heavy); - - let status = Status::new( - onnxruntime, - InferenceDomainMap { - talk: enum_map! { - TalkOperation::PredictDuration - | TalkOperation::PredictIntonation - | TalkOperation::GenerateFullIntermediate => light_session_options, - TalkOperation::RenderAudioSegment => heavy_session_options, - }, - singing_teacher: enum_map! { - SingingTeacherOperation::PredictSingConsonantLength - | SingingTeacherOperation::PredictSingF0 - | SingingTeacherOperation::PredictSingVolume => light_session_options, - }, - frame_decode: enum_map! { - FrameDecodeOperation::SfDecode => heavy_session_options, - }, + } + }; + + info!("{device_for_heavy}を利用します"); + + // 軽いモデルはこちらを使う + let light_session_options = + InferenceSessionOptions::new(options.cpu_num_threads, DeviceSpec::Cpu); + + // 重いモデルはこちらを使う + let heavy_session_options = + InferenceSessionOptions::new(options.cpu_num_threads, device_for_heavy); + + let status = Status::new( + onnxruntime, + InferenceDomainMap { + talk: enum_map! { + TalkOperation::PredictDuration + | TalkOperation::PredictIntonation + | TalkOperation::GenerateFullIntermediate => light_session_options, + TalkOperation::RenderAudioSegment => heavy_session_options, }, - ) - .into(); + singing_teacher: enum_map! 
{ + SingingTeacherOperation::PredictSingConsonantLength + | SingingTeacherOperation::PredictSingF0 + | SingingTeacherOperation::PredictSingVolume => light_session_options, + }, + frame_decode: enum_map! { + FrameDecodeOperation::SfDecode => heavy_session_options, + }, + }, + ) + .into(); - let use_gpu = matches!(device_for_heavy, DeviceSpec::Gpu(_)); + let use_gpu = matches!(device_for_heavy, DeviceSpec::Gpu(_)); - Ok(Self { - status, - open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), - kana_analyzer: KanaAnalyzer, - use_gpu, - _marker: PhantomData, - }) - } + Ok(Self { + status, + open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), + kana_analyzer: KanaAnalyzer, + use_gpu, + _marker: PhantomData, + }) + } - pub(super) fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { - self.status.rt - } + fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { + self.status.rt + } - pub(super) fn is_gpu_mode(&self) -> bool { - self.use_gpu - } + fn is_gpu_mode(&self) -> bool { + self.use_gpu + } - pub(super) async fn load_voice_model( - &self, - model: &voice_model::Inner, - ) -> crate::Result<()> { - let model_bytes = model.read_inference_models().await?; + async fn load_voice_model(&self, model: &voice_model::Inner) -> crate::Result<()> { + let model_bytes = model.read_inference_models().await?; - let status = self.status.clone(); - let header = model.header().clone(); - A::unblock(move || status.insert_model(&header, &model_bytes)).await - } + let status = self.status.clone(); + let header = model.header().clone(); + A::unblock(move || status.insert_model(&header, &model_bytes)).await + } - pub(super) fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { - self.status.unload_model(voice_model_id) - } + fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { + self.status.unload_model(voice_model_id) + } - pub(super) fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { - self.status.is_loaded_model(voice_model_id) - } + fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { + self.status.is_loaded_model(voice_model_id) + } - pub(super) fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.status.is_loaded_model_by_style_id(style_id) - } + fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { + self.status.is_loaded_model_by_style_id(style_id) + } - pub(super) fn metas(&self) -> VoiceModelMeta { - self.status.metas() - } + fn metas(&self) -> VoiceModelMeta { + self.status.metas() + } - pub(super) async fn precompute_render( - &self, - audio_query: &AudioQuery, - style_id: StyleId, - options: &SynthesisOptions, - ) -> Result { - let AudioQuery { - accent_phrases, - speed_scale, - pitch_scale, - intonation_scale, - pre_phoneme_length, - post_phoneme_length, - .. - } = audio_query; + async fn precompute_render( + &self, + audio_query: &AudioQuery, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result { + let AudioQuery { + accent_phrases, + speed_scale, + pitch_scale, + intonation_scale, + pre_phoneme_length, + post_phoneme_length, + .. 
+ } = audio_query; + + let accent_phrases = if options.enable_interrogative_upspeak { + &adjust_interrogative_accent_phrases(accent_phrases) + } else { + accent_phrases + }; - let accent_phrases = if options.enable_interrogative_upspeak { - &adjust_interrogative_accent_phrases(accent_phrases) - } else { - accent_phrases - }; + let (flatten_moras, phoneme_data_list) = initial_process(accent_phrases); - let (flatten_moras, phoneme_data_list) = initial_process(accent_phrases); + let mut phoneme_length_list = vec![*pre_phoneme_length]; + let mut f0_list = vec![0.]; + let mut voiced_list = vec![false]; + { + let mut sum_of_f0_bigger_than_zero = 0.; + let mut count_of_f0_bigger_than_zero = 0; - let mut phoneme_length_list = vec![*pre_phoneme_length]; - let mut f0_list = vec![0.]; - let mut voiced_list = vec![false]; + for Mora { + consonant_length, + vowel_length, + pitch, + .. + } in flatten_moras { - let mut sum_of_f0_bigger_than_zero = 0.; - let mut count_of_f0_bigger_than_zero = 0; - - for Mora { - consonant_length, - vowel_length, - pitch, - .. - } in flatten_moras - { - if let Some(consonant_length) = consonant_length { - phoneme_length_list.push(consonant_length); - } - phoneme_length_list.push(vowel_length); + if let Some(consonant_length) = consonant_length { + phoneme_length_list.push(consonant_length); + } + phoneme_length_list.push(vowel_length); - let f0_single = pitch * 2.0_f32.powf(*pitch_scale); - f0_list.push(f0_single); + let f0_single = pitch * 2.0_f32.powf(*pitch_scale); + f0_list.push(f0_single); - let bigger_than_zero = f0_single > 0.; - voiced_list.push(bigger_than_zero); + let bigger_than_zero = f0_single > 0.; + voiced_list.push(bigger_than_zero); - if bigger_than_zero { - sum_of_f0_bigger_than_zero += f0_single; - count_of_f0_bigger_than_zero += 1; - } + if bigger_than_zero { + sum_of_f0_bigger_than_zero += f0_single; + count_of_f0_bigger_than_zero += 1; } - phoneme_length_list.push(*post_phoneme_length); - f0_list.push(0.); - voiced_list.push(false); - let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); - - if !mean_f0.is_nan() { - for i in 0..f0_list.len() { - if voiced_list[i] { - f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; - } + } + phoneme_length_list.push(*post_phoneme_length); + f0_list.push(0.); + voiced_list.push(false); + let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); + + if !mean_f0.is_nan() { + for i in 0..f0_list.len() { + if voiced_list[i] { + f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; } } } + } - let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); + let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); + + let mut phoneme = Vec::new(); + let mut f0: Vec = Vec::new(); + { + const RATE: f32 = 24000. 
/ 256.; + let mut sum_of_phoneme_length = 0; + let mut count_of_f0 = 0; + let mut vowel_indexes_index = 0; + + for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { + // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする + // + // https://github.com/VOICEVOX/voicevox_engine/issues/552 + let phoneme_length = ((*phoneme_length * RATE).round_ties_even() / speed_scale) + .round_ties_even() as usize; + let phoneme_id = phoneme_data_list[i].phoneme_id(); + + for _ in 0..phoneme_length { + let mut phonemes_vec = [0.; OjtPhoneme::num_phoneme()]; + phonemes_vec[phoneme_id as usize] = 1.; + phoneme.push(phonemes_vec) + } + sum_of_phoneme_length += phoneme_length; - let mut phoneme = Vec::new(); - let mut f0: Vec = Vec::new(); - { - const RATE: f32 = 24000. / 256.; - let mut sum_of_phoneme_length = 0; - let mut count_of_f0 = 0; - let mut vowel_indexes_index = 0; - - for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { - // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする - // - // https://github.com/VOICEVOX/voicevox_engine/issues/552 - let phoneme_length = ((*phoneme_length * RATE).round_ties_even() / speed_scale) - .round_ties_even() as usize; - let phoneme_id = phoneme_data_list[i].phoneme_id(); - - for _ in 0..phoneme_length { - let mut phonemes_vec = [0.; OjtPhoneme::num_phoneme()]; - phonemes_vec[phoneme_id as usize] = 1.; - phoneme.push(phonemes_vec) - } - sum_of_phoneme_length += phoneme_length; - - if i as i64 == vowel_indexes[vowel_indexes_index] { - for _ in 0..sum_of_phoneme_length { - f0.push(f0_list[count_of_f0]); - } - count_of_f0 += 1; - sum_of_phoneme_length = 0; - vowel_indexes_index += 1; + if i as i64 == vowel_indexes[vowel_indexes_index] { + for _ in 0..sum_of_phoneme_length { + f0.push(f0_list[count_of_f0]); } + count_of_f0 += 1; + sum_of_phoneme_length = 0; + vowel_indexes_index += 1; } } + } - let spec = self - .generate_full_intermediate( - f0.len(), - OjtPhoneme::num_phoneme(), - &f0, - phoneme.as_flattened(), - style_id, - ) - .await?; - return Ok(AudioFeature { - internal_state: spec, + let spec = self + .generate_full_intermediate( + f0.len(), + OjtPhoneme::num_phoneme(), + &f0, + phoneme.as_flattened(), style_id, - frame_length: f0.len(), - frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0, - audio_query: audio_query.clone(), - }); - - fn adjust_interrogative_accent_phrases( - accent_phrases: &[AccentPhrase], - ) -> Vec { - accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: adjust_interrogative_moras(accent_phrase), - ..accent_phrase.clone() - }) - .collect() - } + ) + .await?; + return Ok(AudioFeature { + internal_state: spec, + style_id, + frame_length: f0.len(), + frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0, + audio_query: audio_query.clone(), + }); + + fn adjust_interrogative_accent_phrases( + accent_phrases: &[AccentPhrase], + ) -> Vec { + accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: adjust_interrogative_moras(accent_phrase), + ..accent_phrase.clone() + }) + .collect() + } - fn adjust_interrogative_moras( - AccentPhrase { - moras, - is_interrogative, - .. 
- }: &AccentPhrase, - ) -> Vec { - if *is_interrogative && !moras.is_empty() { - let last_mora = moras.last().unwrap(); - if last_mora.pitch != 0.0 { - let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); - new_moras.extend_from_slice(moras.as_slice()); - let interrogative_mora = make_interrogative_mora(last_mora); - new_moras.push(interrogative_mora); - return new_moras; - } + fn adjust_interrogative_moras( + AccentPhrase { + moras, + is_interrogative, + .. + }: &AccentPhrase, + ) -> Vec { + if *is_interrogative && !moras.is_empty() { + let last_mora = moras.last().unwrap(); + if last_mora.pitch != 0.0 { + let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); + new_moras.extend_from_slice(moras.as_slice()); + let interrogative_mora = make_interrogative_mora(last_mora); + new_moras.push(interrogative_mora); + return new_moras; } - moras.clone() } + moras.clone() + } - fn make_interrogative_mora(last_mora: &Mora) -> Mora { - const FIX_VOWEL_LENGTH: f32 = 0.15; - const ADJUST_PITCH: f32 = 0.3; - const MAX_PITCH: f32 = 6.5; + fn make_interrogative_mora(last_mora: &Mora) -> Mora { + const FIX_VOWEL_LENGTH: f32 = 0.15; + const ADJUST_PITCH: f32 = 0.3; + const MAX_PITCH: f32 = 6.5; - let pitch = (last_mora.pitch + ADJUST_PITCH).min(MAX_PITCH); + let pitch = (last_mora.pitch + ADJUST_PITCH).min(MAX_PITCH); - Mora { - text: mora_to_text(None, &last_mora.vowel), - consonant: None, - consonant_length: None, - vowel: last_mora.vowel.clone(), - vowel_length: FIX_VOWEL_LENGTH, - pitch, - } + Mora { + text: mora_to_text(None, &last_mora.vowel), + consonant: None, + consonant_length: None, + vowel: last_mora.vowel.clone(), + vowel_length: FIX_VOWEL_LENGTH, + pitch, } } + } - pub(super) async fn render( - &self, - audio: &AudioFeature, - range: Range, - ) -> Result> { - // TODO: 44.1kHzなどの対応 - if range.is_empty() { - // FIXME: `start>end`に対してパニックせずに正常に空を返してしまうのでは? - // 指定区間が空のときは早期リターン - return Ok(vec![]); - } - let spec_segment = crop_with_margin(audio, range); - let wave_with_margin = self - .render_audio_segment(spec_segment.to_owned(), audio.style_id) - .await?; - let wave = trim_margin_from_wave(wave_with_margin); - return Ok(to_s16le_pcm( - wave.as_slice() - .expect("`trim_margin_from_wave` should just trim an array"), - &audio.audio_query, - )); - - fn to_s16le_pcm( - wave: &[f32], - &AudioQuery { - volume_scale, - output_sampling_rate, - output_stereo, - .. - }: &AudioQuery, - ) -> Vec { - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let bytes_size = wave.len() as u32 * repeat_count * 2; - let buf: Vec = Vec::with_capacity(bytes_size as usize); - let mut cur = Cursor::new(buf); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } + async fn render(&self, audio: &AudioFeature, range: Range) -> Result> { + // TODO: 44.1kHzなどの対応 + if range.is_empty() { + // FIXME: `start>end`に対してパニックせずに正常に空を返してしまうのでは? 
+ // 指定区間が空のときは早期リターン + return Ok(vec![]); + } + let spec_segment = crop_with_margin(audio, range); + let wave_with_margin = self + .render_audio_segment(spec_segment.to_owned(), audio.style_id) + .await?; + let wave = trim_margin_from_wave(wave_with_margin); + return Ok(to_s16le_pcm( + wave.as_slice() + .expect("`trim_margin_from_wave` should just trim an array"), + &audio.audio_query, + )); + + fn to_s16le_pcm( + wave: &[f32], + &AudioQuery { + volume_scale, + output_sampling_rate, + output_stereo, + .. + }: &AudioQuery, + ) -> Vec { + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let repeat_count: u32 = + (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let bytes_size = wave.len() as u32 * repeat_count * 2; + let buf: Vec = Vec::with_capacity(bytes_size as usize); + let mut cur = Cursor::new(buf); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); } - - cur.into_inner() } - } - pub(super) async fn synthesis( - &self, - audio_query: &AudioQuery, - style_id: StyleId, - options: &SynthesisOptions, - ) -> Result> { - let audio = self - .precompute_render(audio_query, style_id, options) - .await?; - let pcm = self.render(&audio, 0..audio.frame_length).await?; - Ok(wav_from_s16le( - &pcm, - audio_query.output_sampling_rate, - audio_query.output_stereo, - )) + cur.into_inner() } + } - pub(super) async fn create_accent_phrases_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result> { - let accent_phrases = self.kana_analyzer.analyze(kana)?; - self.replace_mora_data(&accent_phrases, style_id).await - } + async fn synthesis( + &self, + audio_query: &AudioQuery, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { + let audio = self + .precompute_render(audio_query, style_id, options) + .await?; + let pcm = self.render(&audio, 0..audio.frame_length).await?; + Ok(wav_from_s16le( + &pcm, + audio_query.output_sampling_rate, + audio_query.output_stereo, + )) + } - pub(super) async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhrase], - style_id: StyleId, - ) -> Result> { - let accent_phrases = self - .replace_phoneme_length(accent_phrases, style_id) - .await?; - self.replace_mora_pitch(&accent_phrases, style_id).await - } + async fn create_accent_phrases_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.kana_analyzer.analyze(kana)?; + self.replace_mora_data(&accent_phrases, style_id).await + } - pub(super) async fn replace_phoneme_length( - &self, - accent_phrases: &[AccentPhrase], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = initial_process(accent_phrases); + async fn replace_mora_data( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> Result> { + let accent_phrases = self + .replace_phoneme_length(accent_phrases, style_id) + .await?; + self.replace_mora_pitch(&accent_phrases, style_id).await + } - let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); + async fn replace_phoneme_length( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); - let phoneme_list_s: Vec = phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let phoneme_length = self.predict_duration(&phoneme_list_s, style_id).await?; + let (_, _, vowel_indexes_data) = 
split_mora(&phoneme_data_list); - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: accent_phrase - .moras - .iter() - .map(|mora| { - let new_mora = Mora { - consonant_length: mora.consonant.as_ref().map(|_| { - phoneme_length[vowel_indexes_data[index + 1] as usize - 1] - }), - vowel_length: phoneme_length - [vowel_indexes_data[index + 1] as usize], - ..mora.clone() - }; - index += 1; - new_mora - }) - .collect(), - pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { - let new_pause_mora = Mora { + let phoneme_list_s: Vec = phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let phoneme_length = self.predict_duration(&phoneme_list_s, style_id).await?; + + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: accent_phrase + .moras + .iter() + .map(|mora| { + let new_mora = Mora { + consonant_length: mora.consonant.as_ref().map(|_| { + phoneme_length[vowel_indexes_data[index + 1] as usize - 1] + }), vowel_length: phoneme_length[vowel_indexes_data[index + 1] as usize], - ..pause_mora.clone() + ..mora.clone() }; index += 1; - new_pause_mora - }), - ..accent_phrase.clone() - }) - .collect(); + new_mora + }) + .collect(), + pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { + let new_pause_mora = Mora { + vowel_length: phoneme_length[vowel_indexes_data[index + 1] as usize], + ..pause_mora.clone() + }; + index += 1; + new_pause_mora + }), + ..accent_phrase.clone() + }) + .collect(); - Ok(new_accent_phrases) - } + Ok(new_accent_phrases) + } - pub(super) async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhrase], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = initial_process(accent_phrases); - - let mut base_start_accent_list = vec![0]; - let mut base_end_accent_list = vec![0]; - let mut base_start_accent_phrase_list = vec![0]; - let mut base_end_accent_phrase_list = vec![0]; - for accent_phrase in accent_phrases { - let mut accent = usize::from(accent_phrase.accent != 1); - create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); - - accent = accent_phrase.accent - 1; - create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); - create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); - create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); - } - base_start_accent_list.push(0); - base_end_accent_list.push(0); - base_start_accent_phrase_list.push(0); - base_end_accent_phrase_list.push(0); + async fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let mut base_start_accent_list = vec![0]; + let mut base_end_accent_list = vec![0]; + let mut base_start_accent_phrase_list = vec![0]; + let mut base_end_accent_phrase_list = vec![0]; + for accent_phrase in accent_phrases { + let mut accent = usize::from(accent_phrase.accent != 1); + create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); + + accent = accent_phrase.accent - 1; + create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); + create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); + create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); + } + base_start_accent_list.push(0); + base_end_accent_list.push(0); + 
base_start_accent_phrase_list.push(0); + base_end_accent_phrase_list.push(0); + + let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = + split_mora(&phoneme_data_list); + + let consonant_phoneme_list: Vec = consonant_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let vowel_phoneme_list: Vec = vowel_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); - let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = - split_mora(&phoneme_data_list); + let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let consonant_phoneme_list: Vec = consonant_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let vowel_phoneme_list: Vec = vowel_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - - let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - - for vowel_index in vowel_indexes { - start_accent_list.push(base_start_accent_list[vowel_index as usize]); - end_accent_list.push(base_end_accent_list[vowel_index as usize]); - start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); - end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); - } + for vowel_index in vowel_indexes { + start_accent_list.push(base_start_accent_list[vowel_index as usize]); + end_accent_list.push(base_end_accent_list[vowel_index as usize]); + start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); + end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); + } - let mut f0_list = self - .predict_intonation( - vowel_phoneme_list.len(), - &vowel_phoneme_list, - &consonant_phoneme_list, - &start_accent_list, - &end_accent_list, - &start_accent_phrase_list, - &end_accent_phrase_list, - style_id, - ) - .await?; + let mut f0_list = self + .predict_intonation( + vowel_phoneme_list.len(), + &vowel_phoneme_list, + &consonant_phoneme_list, + &start_accent_list, + &end_accent_list, + &start_accent_phrase_list, + &end_accent_phrase_list, + style_id, + ) + .await?; - for i in 0..vowel_phoneme_data_list.len() { - const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; + for i in 0..vowel_phoneme_data_list.len() { + const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; - if UNVOICED_MORA_PHONEME_LIST - .iter() - .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) - { - f0_list[i] = 0.; - } + if UNVOICED_MORA_PHONEME_LIST + .iter() + .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) + { + f0_list[i] = 0.; } + } - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: accent_phrase - .moras - .iter() - .map(|mora| { - let new_mora = Mora { - pitch: f0_list[index + 1], - ..mora.clone() - }; - index += 1; - new_mora - }) - .collect(), - pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { - let new_pause_mora = Mora { 
+ let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: accent_phrase + .moras + .iter() + .map(|mora| { + let new_mora = Mora { pitch: f0_list[index + 1], - ..pause_mora.clone() + ..mora.clone() }; index += 1; - new_pause_mora - }), - ..accent_phrase.clone() - }) - .collect(); - - return Ok(new_accent_phrases); + new_mora + }) + .collect(), + pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { + let new_pause_mora = Mora { + pitch: f0_list[index + 1], + ..pause_mora.clone() + }; + index += 1; + new_pause_mora + }), + ..accent_phrase.clone() + }) + .collect(); - fn create_one_accent_list( - accent_list: &mut Vec, - accent_phrase: &AccentPhrase, - point: i32, - ) { - let mut one_accent_list: Vec = Vec::new(); + return Ok(new_accent_phrases); - for (i, mora) in accent_phrase.moras.iter().enumerate() { - let value = (i as i32 == point - || (point < 0 && i == (accent_phrase.moras.len() as i32 + point) as usize)) - .into(); + fn create_one_accent_list( + accent_list: &mut Vec, + accent_phrase: &AccentPhrase, + point: i32, + ) { + let mut one_accent_list: Vec = Vec::new(); + + for (i, mora) in accent_phrase.moras.iter().enumerate() { + let value = (i as i32 == point + || (point < 0 && i == (accent_phrase.moras.len() as i32 + point) as usize)) + .into(); + one_accent_list.push(value); + if mora.consonant.is_some() { one_accent_list.push(value); - if mora.consonant.is_some() { - one_accent_list.push(value); - } - } - if accent_phrase.pause_mora.is_some() { - one_accent_list.push(0); } - accent_list.extend(one_accent_list) } + if accent_phrase.pause_mora.is_some() { + one_accent_list.push(0); + } + accent_list.extend(one_accent_list) } + } - pub(super) async fn create_audio_query_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result { - let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; - Ok(AudioQuery::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) - } - - pub(super) async fn tts_from_kana( - &self, - kana: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let audio_query = &self.create_audio_query_from_kana(kana, style_id).await?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) - .await - } + async fn create_audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; + Ok(AudioQuery::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) } - impl Inner { - pub(super) async fn create_accent_phrases( - &self, - text: &str, - style_id: StyleId, - ) -> Result> { - let accent_phrases = self.open_jtalk_analyzer.analyze(text)?; - self.replace_mora_data(&accent_phrases, style_id).await - } + async fn tts_from_kana( + &self, + kana: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> Result> { + let audio_query = &self.create_audio_query_from_kana(kana, style_id).await?; + self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + .await + } +} - pub(super) async fn create_audio_query( - &self, - text: &str, - style_id: StyleId, - ) -> Result { - let accent_phrases = self.create_accent_phrases(text, style_id).await?; - Ok(AudioQuery::from_accent_phrases(accent_phrases)) - } +impl Inner { + async fn create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.open_jtalk_analyzer.analyze(text)?; + 
self.replace_mora_data(&accent_phrases, style_id).await + } - pub(super) async fn tts( - &self, - text: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let audio_query = &self.create_audio_query(text, style_id).await?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) - .await - } + async fn create_audio_query(&self, text: &str, style_id: StyleId) -> Result { + let accent_phrases = self.create_accent_phrases(text, style_id).await?; + Ok(AudioQuery::from_accent_phrases(accent_phrases)) } - // TODO: この層を破壊する - impl Inner { - pub(super) async fn predict_duration( - &self, - phoneme_vector: &[i64], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let phoneme_vector = ndarray::arr1(phoneme_vector); - status.predict_duration::(phoneme_vector, style_id).await - } - - #[expect( - clippy::too_many_arguments, - reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ - まとめたりしても可読性に寄与しない" - )] - pub(super) async fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let vowel_phoneme_vector = ndarray::arr1(vowel_phoneme_vector); - let consonant_phoneme_vector = ndarray::arr1(consonant_phoneme_vector); - let start_accent_vector = ndarray::arr1(start_accent_vector); - let end_accent_vector = ndarray::arr1(end_accent_vector); - let start_accent_phrase_vector = ndarray::arr1(start_accent_phrase_vector); - let end_accent_phrase_vector = ndarray::arr1(end_accent_phrase_vector); - status - .predict_intonation::( - length, - vowel_phoneme_vector, - consonant_phoneme_vector, - start_accent_vector, - end_accent_vector, - start_accent_phrase_vector, - end_accent_phrase_vector, - style_id, - ) - .await - } + async fn tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { + let audio_query = &self.create_audio_query(text, style_id).await?; + self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + .await + } +} - pub(super) async fn generate_full_intermediate( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let f0 = ndarray::arr1(f0); - let phoneme_vector = ndarray::arr1(phoneme_vector); - status - .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) - .await - } +// TODO: この層を破壊する +impl Inner { + async fn predict_duration( + &self, + phoneme_vector: &[i64], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let phoneme_vector = ndarray::arr1(phoneme_vector); + status.predict_duration::(phoneme_vector, style_id).await + } - pub(super) async fn render_audio_segment( - &self, - spec: ndarray::Array2, - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - status.render_audio_segment::(spec, style_id).await - } + #[expect( + clippy::too_many_arguments, + reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ + まとめたりしても可読性に寄与しない" + )] + async fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: &[i64], + consonant_phoneme_vector: &[i64], + start_accent_vector: &[i64], + end_accent_vector: &[i64], + start_accent_phrase_vector: &[i64], + end_accent_phrase_vector: &[i64], + style_id: StyleId, + ) -> Result> { + 
let status = self.status.clone(); + let vowel_phoneme_vector = ndarray::arr1(vowel_phoneme_vector); + let consonant_phoneme_vector = ndarray::arr1(consonant_phoneme_vector); + let start_accent_vector = ndarray::arr1(start_accent_vector); + let end_accent_vector = ndarray::arr1(end_accent_vector); + let start_accent_phrase_vector = ndarray::arr1(start_accent_phrase_vector); + let end_accent_phrase_vector = ndarray::arr1(end_accent_phrase_vector); + status + .predict_intonation::( + length, + vowel_phoneme_vector, + consonant_phoneme_vector, + start_accent_vector, + end_accent_vector, + start_accent_phrase_vector, + end_accent_phrase_vector, + style_id, + ) + .await + } - pub(super) async fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let f0 = ndarray::arr1(f0); - let phoneme_vector = ndarray::arr1(phoneme_vector); - status - .decode::(length, phoneme_size, f0, phoneme_vector, style_id) - .await - } + async fn generate_full_intermediate( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let f0 = ndarray::arr1(f0); + let phoneme_vector = ndarray::arr1(phoneme_vector); + status + .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) + .await } - impl Status { - pub(super) async fn predict_duration( - &self, - phoneme_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictDurationOutput { - phoneme_length: output, - } = self - .run_session::( - model_id, - PredictDurationInput { - phoneme_list: phoneme_vector, - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - ) - .await?; - let mut output = output.into_raw_vec(); + async fn render_audio_segment( + &self, + spec: ndarray::Array2, + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + status.render_audio_segment::(spec, style_id).await + } - for output_item in output.iter_mut() { - if *output_item < PHONEME_LENGTH_MINIMAL { - *output_item = PHONEME_LENGTH_MINIMAL; - } - } + async fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let f0 = ndarray::arr1(f0); + let phoneme_vector = ndarray::arr1(phoneme_vector); + status + .decode::(length, phoneme_size, f0, phoneme_vector, style_id) + .await + } +} - return Ok(output); +impl Status { + async fn predict_duration( + &self, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictDurationOutput { + phoneme_length: output, + } = self + .run_session::( + model_id, + PredictDurationInput { + phoneme_list: phoneme_vector, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + ) + .await?; + let mut output = output.into_raw_vec(); - const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + for output_item in output.iter_mut() { + if *output_item < PHONEME_LENGTH_MINIMAL { + *output_item = PHONEME_LENGTH_MINIMAL; + } } - #[expect( - clippy::too_many_arguments, - reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ - まとめたりしても可読性に寄与しない" - )] - pub(super) async fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: ndarray::Array1, - consonant_phoneme_vector: 
ndarray::Array1, - start_accent_vector: ndarray::Array1, - end_accent_vector: ndarray::Array1, - start_accent_phrase_vector: ndarray::Array1, - end_accent_phrase_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictIntonationOutput { f0_list: output } = self - .run_session::( - model_id, - PredictIntonationInput { - length: ndarray::arr0(length as i64), - vowel_phoneme_list: vowel_phoneme_vector, - consonant_phoneme_list: consonant_phoneme_vector, - start_accent_list: start_accent_vector, - end_accent_list: end_accent_vector, - start_accent_phrase_list: start_accent_phrase_vector, - end_accent_phrase_list: end_accent_phrase_vector, - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - ) - .await?; + return Ok(output); - Ok(output.into_raw_vec()) - } + const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + } - /// モデル`generate_full_intermediate`の実行と、その前後の処理を行う。 - /// - /// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。 - pub(super) async fn generate_full_intermediate( - &self, - length: usize, - phoneme_size: usize, - f0: ndarray::Array1, - phoneme_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - // 音が途切れてしまうのを避けるworkaround処理が入っている - // TODO: 改善したらここのpadding処理を取り除く - let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; - let length_with_padding = f0.len() + start_and_end_padding_size; - let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); - let phoneme_with_padding = make_phoneme_with_padding( - phoneme_vector.into_shape([length, phoneme_size]).unwrap(), - PADDING_FRAME_LENGTH, - ); + #[expect( + clippy::too_many_arguments, + reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ + まとめたりしても可読性に寄与しない" + )] + async fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: ndarray::Array1, + consonant_phoneme_vector: ndarray::Array1, + start_accent_vector: ndarray::Array1, + end_accent_vector: ndarray::Array1, + start_accent_phrase_vector: ndarray::Array1, + end_accent_phrase_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictIntonationOutput { f0_list: output } = self + .run_session::( + model_id, + PredictIntonationInput { + length: ndarray::arr0(length as i64), + vowel_phoneme_list: vowel_phoneme_vector, + consonant_phoneme_list: consonant_phoneme_vector, + start_accent_list: start_accent_vector, + end_accent_list: end_accent_vector, + start_accent_phrase_list: start_accent_phrase_vector, + end_accent_phrase_list: end_accent_phrase_vector, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + ) + .await?; - let GenerateFullIntermediateOutput { - spec: spec_with_padding, - } = self - .run_session::( - model_id, - GenerateFullIntermediateInput { - f0: f0_with_padding - .into_shape([length_with_padding, 1]) - .unwrap(), - phoneme: phoneme_with_padding, - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - ) - .await?; + Ok(output.into_raw_vec()) + } - // マージンがデータからはみ出さないことを保証 - // cf. 
https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 - if MARGIN > PADDING_FRAME_LENGTH { - unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); - } - // マージン分を両端に残して音声特徴量を返す - return Ok(spec_with_padding - .slice(ndarray::s![ - PADDING_FRAME_LENGTH - MARGIN - ..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN, - .. - ]) - .to_owned()); - - fn make_f0_with_padding( - f0_slice: ndarray::Array1, - padding_size: usize, - ) -> ndarray::Array1 { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let padding = ndarray::Array1::::zeros(padding_size); - ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] - } + /// モデル`generate_full_intermediate`の実行と、その前後の処理を行う。 + /// + /// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。 + async fn generate_full_intermediate( + &self, + length: usize, + phoneme_size: usize, + f0: ndarray::Array1, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + // 音が途切れてしまうのを避けるworkaround処理が入っている + // TODO: 改善したらここのpadding処理を取り除く + let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; + let length_with_padding = f0.len() + start_and_end_padding_size; + let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); + let phoneme_with_padding = make_phoneme_with_padding( + phoneme_vector.into_shape([length, phoneme_size]).unwrap(), + PADDING_FRAME_LENGTH, + ); - fn make_phoneme_with_padding( - phoneme_slice: ndarray::Array2, - padding_size: usize, - ) -> ndarray::Array2 { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut padding = - ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); - padding - .slice_mut(ndarray::s![.., 0]) - .assign(&ndarray::arr0(1.0)); - ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] - } + let GenerateFullIntermediateOutput { + spec: spec_with_padding, + } = self + .run_session::( + model_id, + GenerateFullIntermediateInput { + f0: f0_with_padding + .into_shape([length_with_padding, 1]) + .unwrap(), + phoneme: phoneme_with_padding, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + ) + .await?; + + // マージンがデータからはみ出さないことを保証 + // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 + if MARGIN > PADDING_FRAME_LENGTH { + unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); + } + // マージン分を両端に残して音声特徴量を返す + return Ok(spec_with_padding + .slice(ndarray::s![ + PADDING_FRAME_LENGTH - MARGIN + ..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN, + .. 
+ ]) + .to_owned()); + + fn make_f0_with_padding( + f0_slice: ndarray::Array1, + padding_size: usize, + ) -> ndarray::Array1 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let padding = ndarray::Array1::::zeros(padding_size); + ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] + } + + fn make_phoneme_with_padding( + phoneme_slice: ndarray::Array2, + padding_size: usize, + ) -> ndarray::Array2 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let mut padding = ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); + padding + .slice_mut(ndarray::s![.., 0]) + .assign(&ndarray::arr0(1.0)); + ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] } + } - /// 与えられた音声特徴量で音声生成。 - pub(super) async fn render_audio_segment( - &self, - spec: ndarray::Array2, - style_id: StyleId, - ) -> Result> { - let (model_id, _inner_voice_id) = self.ids_for::(style_id)?; - let RenderAudioSegmentOutput { wave } = self - .run_session::(model_id, RenderAudioSegmentInput { spec }) - .await?; - Ok(wave) - } + /// 与えられた音声特徴量で音声生成。 + async fn render_audio_segment( + &self, + spec: ndarray::Array2, + style_id: StyleId, + ) -> Result> { + let (model_id, _inner_voice_id) = self.ids_for::(style_id)?; + let RenderAudioSegmentOutput { wave } = self + .run_session::(model_id, RenderAudioSegmentInput { spec }) + .await?; + Ok(wave) + } - pub(super) async fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: ndarray::Array1, - phoneme_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let intermediate = self - .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) - .await?; - let output_with_margin = self - .render_audio_segment::(intermediate, style_id) - .await?; - let output = trim_margin_from_wave(output_with_margin); - Ok(output.to_vec()) - } - - pub(super) async fn predict_sing_consonant_length( - &self, - consonant: ndarray::Array1, - vowel: ndarray::Array1, - note_duration: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictSingConsonantLengthOutput { consonant_lengths } = self - .run_session::( - model_id, - PredictSingConsonantLengthInput { - consonants: consonant.into_one_row(), - vowels: vowel.into_one_row(), - note_durations: note_duration.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + async fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: ndarray::Array1, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let intermediate = self + .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) + .await?; + let output_with_margin = self + .render_audio_segment::(intermediate, style_id) + .await?; + let output = trim_margin_from_wave(output_with_margin); + Ok(output.to_vec()) + } - Ok(consonant_lengths) - } + async fn predict_sing_consonant_length( + &self, + consonant: ndarray::Array1, + vowel: ndarray::Array1, + note_duration: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingConsonantLengthOutput { consonant_lengths } = self + .run_session::( + model_id, + PredictSingConsonantLengthInput { + consonants: consonant.into_one_row(), + vowels: vowel.into_one_row(), + note_durations: note_duration.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; - pub(super) async fn predict_sing_f0( 
- &self, - phoneme: ndarray::Array1, - note: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictSingF0Output { f0s } = self - .run_session::( - model_id, - PredictSingF0Input { - phonemes: phoneme.into_one_row(), - notes: note.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + Ok(consonant_lengths) + } - Ok(f0s) - } + async fn predict_sing_f0( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingF0Output { f0s } = self + .run_session::( + model_id, + PredictSingF0Input { + phonemes: phoneme.into_one_row(), + notes: note.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; - pub(super) async fn predict_sing_volume( - &self, - phoneme: ndarray::Array1, - note: ndarray::Array1, - f0: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictSingVolumeOutput { volumes } = self - .run_session::( - model_id, - PredictSingVolumeInput { - phonemes: phoneme.into_one_row(), - notes: note.into_one_row(), - frame_f0s: f0.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + Ok(f0s) + } - Ok(volumes) - } + async fn predict_sing_volume( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + f0: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingVolumeOutput { volumes } = self + .run_session::( + model_id, + PredictSingVolumeInput { + phonemes: phoneme.into_one_row(), + notes: note.into_one_row(), + frame_f0s: f0.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; - pub(super) async fn sf_decode( - &self, - phoneme: ndarray::Array1, - f0: ndarray::Array1, - volume: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let SfDecodeOutput { wav } = self - .run_session::( - model_id, - SfDecodeInput { - frame_phonemes: phoneme.into_one_row(), - frame_f0s: f0.into_one_row(), - frame_volumes: volume.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + Ok(volumes) + } - Ok(wav) - } + async fn sf_decode( + &self, + phoneme: ndarray::Array1, + f0: ndarray::Array1, + volume: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let SfDecodeOutput { wav } = self + .run_session::( + model_id, + SfDecodeInput { + frame_phonemes: phoneme.into_one_row(), + frame_f0s: f0.into_one_row(), + frame_volumes: volume.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; + + Ok(wav) } +} - #[ext] - impl ndarray::Array1 { - fn into_one_row(self) -> ndarray::Array2 { - let n = self.len(); - self.into_shape([1, n]).expect("should be ok") - } +#[ext] +impl ndarray::Array1 { + fn into_one_row(self) -> ndarray::Array2 { + let n = self.len(); + self.into_shape([1, n]).expect("should be ok") } +} - #[cfg(windows)] - fn list_windows_video_cards() { - use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; +#[cfg(windows)] +fn list_windows_video_cards() { + use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; - use humansize::BINARY; - use 
tracing::{error, info}; - use windows::Win32::Graphics::Dxgi::{ - CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND, - }; + use humansize::BINARY; + use tracing::{error, info}; + use windows::Win32::Graphics::Dxgi::{ + CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND, + }; - info!("検出されたGPU (DirectMLにはGPU 0が使われます):"); - match list_windows_video_cards() { - Ok(descs) => { - for (device_id, desc) in descs.into_iter().enumerate() { - let description = OsString::from_wide(trim_nul(&desc.Description)); - let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY); - info!(" GPU {device_id}: {description:?} ({vram})"); - } + info!("検出されたGPU (DirectMLにはGPU 0が使われます):"); + match list_windows_video_cards() { + Ok(descs) => { + for (device_id, desc) in descs.into_iter().enumerate() { + let description = OsString::from_wide(trim_nul(&desc.Description)); + let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY); + info!(" GPU {device_id}: {description:?} ({vram})"); } - Err(err) => error!("{err}"), } + Err(err) => error!("{err}"), + } - fn list_windows_video_cards() -> windows::core::Result> { - unsafe { - let factory = CreateDXGIFactory::()?; - (0..) - .map(|i| factory.EnumAdapters(i)?.GetDesc()) - .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND)) - .collect() - } + fn list_windows_video_cards() -> windows::core::Result> { + unsafe { + let factory = CreateDXGIFactory::()?; + (0..) + .map(|i| factory.EnumAdapters(i)?.GetDesc()) + .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND)) + .collect() } + } - fn trim_nul(s: &[u16]) -> &[u16] { - &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())] - } + fn trim_nul(s: &[u16]) -> &[u16] { + &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())] } +} - fn initial_process(accent_phrases: &[AccentPhrase]) -> (Vec, Vec) { - let flatten_moras = to_flatten_moras(accent_phrases); +fn initial_process(accent_phrases: &[AccentPhrase]) -> (Vec, Vec) { + let flatten_moras = to_flatten_moras(accent_phrases); - let mut phoneme_strings = vec!["pau".to_string()]; - for mora in flatten_moras.iter() { - if let Some(consonant) = &mora.consonant { - phoneme_strings.push(consonant.clone()) - } - phoneme_strings.push(mora.vowel.clone()); + let mut phoneme_strings = vec!["pau".to_string()]; + for mora in flatten_moras.iter() { + if let Some(consonant) = &mora.consonant { + phoneme_strings.push(consonant.clone()) } - phoneme_strings.push("pau".to_string()); + phoneme_strings.push(mora.vowel.clone()); + } + phoneme_strings.push("pau".to_string()); - let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); + let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); - return (flatten_moras, phoneme_data_list); + return (flatten_moras, phoneme_data_list); - fn to_flatten_moras(accent_phrases: &[AccentPhrase]) -> Vec { - let mut flatten_moras = Vec::new(); + fn to_flatten_moras(accent_phrases: &[AccentPhrase]) -> Vec { + let mut flatten_moras = Vec::new(); - for AccentPhrase { - moras, pause_mora, .. - } in accent_phrases - { - for mora in moras { - flatten_moras.push(mora.clone()); - } - if let Some(pause_mora) = pause_mora { - flatten_moras.push(pause_mora.clone()); - } + for AccentPhrase { + moras, pause_mora, .. 
+ } in accent_phrases + { + for mora in moras { + flatten_moras.push(mora.clone()); + } + if let Some(pause_mora) = pause_mora { + flatten_moras.push(pause_mora.clone()); } - - flatten_moras } - fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { - OjtPhoneme::convert( - phoneme_str_list - .iter() - .map(AsRef::as_ref) - .map(ToOwned::to_owned) - .map(OjtPhoneme::new) - .collect::>() - .as_slice(), - ) - } + flatten_moras } - fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { - let mut vowel_indexes = Vec::new(); - for (i, phoneme) in phoneme_list.iter().enumerate() { - const MORA_PHONEME_LIST: &[&str] = &[ - "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", - ]; - - if MORA_PHONEME_LIST + fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { + OjtPhoneme::convert( + phoneme_str_list .iter() - .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) - { - vowel_indexes.push(i as i64); - } - } + .map(AsRef::as_ref) + .map(ToOwned::to_owned) + .map(OjtPhoneme::new) + .collect::>() + .as_slice(), + ) + } +} - let vowel_phoneme_list = vowel_indexes +fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { + let mut vowel_indexes = Vec::new(); + for (i, phoneme) in phoneme_list.iter().enumerate() { + const MORA_PHONEME_LIST: &[&str] = &[ + "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", + ]; + + if MORA_PHONEME_LIST .iter() - .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) - .collect(); + .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) + { + vowel_indexes.push(i as i64); + } + } - let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; - for i in 0..(vowel_indexes.len() - 1) { - let prev = vowel_indexes[i]; - let next = vowel_indexes[i + 1]; - if next - prev == 1 { - consonant_phoneme_list.push(OjtPhoneme::default()); - } else { - consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); - } + let vowel_phoneme_list = vowel_indexes + .iter() + .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) + .collect(); + + let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; + for i in 0..(vowel_indexes.len() - 1) { + let prev = vowel_indexes[i]; + let next = vowel_indexes[i + 1]; + if next - prev == 1 { + consonant_phoneme_list.push(OjtPhoneme::default()); + } else { + consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); } + } - (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) - } - - impl AudioQuery { - fn from_accent_phrases(accent_phrases: Vec) -> Self { - let kana = create_kana(&accent_phrases); - Self { - accent_phrases, - speed_scale: 1., - pitch_scale: 0., - intonation_scale: 1., - volume_scale: 1., - pre_phoneme_length: 0.1, - post_phoneme_length: 0.1, - output_sampling_rate: DEFAULT_SAMPLING_RATE, - output_stereo: false, - pause_length: (), - pause_length_scale: (), - kana: Some(kana), - } + (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) +} + +impl AudioQuery { + fn from_accent_phrases(accent_phrases: Vec) -> Self { + let kana = create_kana(&accent_phrases); + Self { + accent_phrases, + speed_scale: 1., + pitch_scale: 0., + intonation_scale: 1., + volume_scale: 1., + pre_phoneme_length: 0.1, + post_phoneme_length: 0.1, + output_sampling_rate: DEFAULT_SAMPLING_RATE, + output_stereo: false, + pause_length: (), + pause_length_scale: (), + kana: Some(kana), } } } @@ -1324,9 +1294,9 @@ pub(crate) mod blocking { FullcontextExtractor, StyleId, VoiceModelId, VoiceModelMeta, }; - use super::{inner::Inner, InitializeOptions, 
SynthesisOptions, TtsOptions}; + use super::{InitializeOptions, Inner, SynthesisOptions, TtsOptions}; - pub use super::inner::AudioFeature; + pub use super::AudioFeature; /// 音声シンセサイザ。 pub struct Synthesizer(pub(super) Inner); @@ -1772,7 +1742,7 @@ pub(crate) mod nonblocking { StyleId, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; - use super::{inner::Inner, InitializeOptions, TtsOptions}; + use super::{InitializeOptions, Inner, TtsOptions}; /// 音声シンセサイザ。 /// diff --git a/crates/voicevox_core_c_api/tests/e2e/log_mask.rs b/crates/voicevox_core_c_api/tests/e2e/log_mask.rs index f4771ac3e..a9ffe0f6e 100644 --- a/crates/voicevox_core_c_api/tests/e2e/log_mask.rs +++ b/crates/voicevox_core_c_api/tests/e2e/log_mask.rs @@ -31,7 +31,7 @@ impl Utf8Output { pub(crate) fn mask_windows_video_cards(self) -> Self { self.mask_stderr( static_regex!( - r#"(?m)^\{timestamp\} INFO voicevox_core::synthesizer::inner: 検出されたGPU \(DirectMLにはGPU 0が使われます\):(\n\{timestamp\} INFO voicevox_core::synthesizer::inner: GPU [0-9]+: "[^"]+" \([0-9.]+ [a-zA-Z]+\))+"#, + r#"(?m)^\{timestamp\} INFO voicevox_core::synthesizer: 検出されたGPU \(DirectMLにはGPU 0が使われます\):(\n\{timestamp\} INFO voicevox_core::synthesizer: GPU [0-9]+: "[^"]+" \([0-9.]+ [a-zA-Z]+\))+"#, ), "{windows-video-cards}", ) diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index c45292fad..87a704af4 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -72,10 +72,10 @@ metas = ''' ]''' stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [compatible_engine_load_model_before_initialize] @@ -138,10 +138,10 @@ stderr = '' output."こんにちは、音声合成の世界へようこそ".wav_length = 176172 stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [synthesizer_new_output_json] @@ -218,29 +218,29 @@ metas = ''' ]''' stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [tts_via_audio_query] output."こんにちは、音声合成の世界へようこそ".wav_length = 176172 stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [user_dict_load] stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [user_dict_manipulate]
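For readers skimming the moved code: below is a minimal, standalone sketch (not part of the patch) of the consonant/vowel pairing that `split_mora` performs, using plain `&str` phonemes in place of `OjtPhoneme` so it compiles on its own. The function and variable names here, and the example input in `main`, are illustrative only.

/// Phonemes that can end a mora (vowels, devoiced vowels, "N", "cl", "pau"),
/// copied from the list used by the diff's `split_mora`.
const MORA_PHONEME_LIST: &[&str] = &[
    "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau",
];

/// Splits a flat phoneme sequence into per-mora consonant/vowel lists plus the
/// indexes of the mora-final phonemes, mirroring the structure of `split_mora`.
fn split_mora<'a>(phonemes: &[&'a str]) -> (Vec<&'a str>, Vec<&'a str>, Vec<i64>) {
    // A mora ends at every vowel-like phoneme.
    let mut vowel_indexes = Vec::new();
    for (i, phoneme) in phonemes.iter().enumerate() {
        if MORA_PHONEME_LIST.iter().any(|m| m == phoneme) {
            vowel_indexes.push(i as i64);
        }
    }

    let vowels: Vec<&str> = vowel_indexes
        .iter()
        .map(|&i| phonemes[i as usize])
        .collect();

    // A consonant exists only when two consecutive vowel indexes are not adjacent;
    // the first mora gets a placeholder ("" here, `OjtPhoneme::default()` in the diff).
    let mut consonants: Vec<&str> = vec![""];
    for w in vowel_indexes.windows(2) {
        let (prev, next) = (w[0], w[1]);
        consonants.push(if next - prev == 1 {
            "" // adjacent vowels: this mora has no consonant
        } else {
            phonemes[next as usize - 1]
        });
    }

    (consonants, vowels, vowel_indexes)
}

fn main() {
    // Flattened phonemes for "こんにちは" surrounded by pauses (illustrative only).
    let phonemes = ["pau", "k", "o", "N", "n", "i", "ch", "i", "w", "a", "pau"];
    let (consonants, vowels, vowel_indexes) = split_mora(&phonemes);
    for ((c, v), i) in consonants.iter().zip(&vowels).zip(&vowel_indexes) {
        println!("mora ending at index {i}: consonant={c:?}, vowel={v:?}");
    }
}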
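Likewise, a small self-contained sketch of the two ndarray manipulations the diff relies on: zero-padding an f0 contour on both ends (the workaround in `make_f0_with_padding`) and reshaping a 1-D vector into a single-row matrix (`into_one_row`). It assumes only the `ndarray` crate; `pad_f0` and the numbers in `main` are made up for the example.

use ndarray::{Array1, Array2};

/// Pads an f0 contour with `padding_size` silent (zero) frames on both ends.
fn pad_f0(f0: Array1<f32>, padding_size: usize) -> Array1<f32> {
    let padding = Array1::<f32>::zeros(padding_size);
    ndarray::concatenate![ndarray::Axis(0), padding, f0, padding]
}

/// Reshapes a length-n vector into a (1, n) matrix, as `into_one_row` does for
/// the single-batch inference inputs.
fn into_one_row(v: Array1<f32>) -> Array2<f32> {
    let n = v.len();
    v.into_shape([1, n])
        .expect("a length-n vector always fits a 1×n matrix")
}

fn main() {
    let f0 = Array1::from(vec![5.5_f32, 5.6, 5.4]);
    let padded = pad_f0(f0, 2);
    assert_eq!(padded.len(), 7); // 2 + 3 + 2 frames

    let row = into_one_row(padded);
    assert_eq!(row.shape(), &[1, 7]);
}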