From 632cb13e2d793b41e7db3d9e6516df32298a6cbd Mon Sep 17 00:00:00 2001 From: y-chan Date: Sat, 6 Jan 2024 18:20:35 +0900 Subject: [PATCH 01/20] remove contour and rename to talk xxx --- crates/voicevox_core/src/publish.rs | 106 ------------------ crates/voicevox_core/src/status.rs | 55 +-------- crates/voicevox_core/src/status/model_file.rs | 13 +-- .../src/compatible_engine.rs | 27 ----- 4 files changed, 9 insertions(+), 192 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 5dbb333a9..496fc7d73 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -140,21 +140,6 @@ impl VoicevoxCore { ) } - pub fn predict_contour( - &mut self, - length: usize, - f0_discrete: &[f32], - phoneme_vector: &[i64], - speaker_id: u32, - ) -> Result> { - self.synthesis_engine.inference_core_mut().predict_contour( - length, - f0_discrete, - phoneme_vector, - speaker_id, - ) - } - pub fn decode( &mut self, length: usize, @@ -521,59 +506,6 @@ impl InferenceCore { status.predict_intonation_session_run(model_index, input_tensors) } - pub fn predict_contour( - &mut self, - length: usize, - f0_discrete: &[f32], - phoneme_vector: &[i64], - speaker_id: u32, - ) -> Result> { - if !self.initialized { - return Err(Error::UninitializedStatus); - } - - let status = self - .status_option - .as_mut() - .ok_or(Error::UninitializedStatus)?; - - if !status.validate_speaker_id(speaker_id) { - return Err(Error::InvalidSpeakerId { speaker_id }); - } - - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; - - if model_index >= MODEL_FILE_SET.models_count() { - return Err(Error::InvalidModelIndex { model_index }); - } - - let mut f0_discrete_array = - NdArray::new(ndarray::arr1(f0_discrete).into_shape([length, 1]).unwrap()); - let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); - let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); - - let input_tensors: Vec<&mut dyn AnyArray> = vec![ - &mut f0_discrete_array, - &mut phoneme_vector_array, - &mut speaker_id_array, - ]; - - let (mut f0_contour, voiced) = - status.predict_contour_session_run(model_index, input_tensors)?; - for (f0_contour_item, voiced_item) in f0_contour.iter_mut().zip(voiced.iter()) { - if *voiced_item < 0.0 { - *f0_contour_item = 0.0; - } - } - - Ok(f0_contour) - } - pub fn decode( &mut self, length: usize, @@ -987,44 +919,6 @@ mod tests { assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); } - #[rstest] - fn predict_contour_works() { - let internal = VoicevoxCore::new_with_mutex(); - internal - .lock() - .unwrap() - .initialize(InitializeOptions { - load_all_models: true, - acceleration_mode: AccelerationMode::Cpu, - ..Default::default() - }) - .unwrap(); - - // 「テスト」という文章に対応する入力 - const F0_LENGTH: usize = 69; - let mut f0_discrete = [0.; F0_LENGTH]; - f0_discrete[9..24].fill(5.905218); - f0_discrete[37..60].fill(5.565851); - - let mut phoneme = [0; F0_LENGTH]; - phoneme[0..9].fill(0); - phoneme[9..13].fill(37); - phoneme[13..24].fill(14); - phoneme[24..30].fill(35); - phoneme[30..37].fill(6); - phoneme[37..45].fill(37); - phoneme[45..60].fill(30); - phoneme[60..69].fill(0); - - let result = internal - .lock() - .unwrap() - .predict_contour(F0_LENGTH, &f0_discrete, &phoneme, 2); - - assert!(result.is_ok(), "{result:?}"); - assert_eq!(result.unwrap().len(), F0_LENGTH); - } - #[rstest] fn decode_works() { let internal = VoicevoxCore::new_with_mutex(); diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 81bd74509..96c984385 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -40,7 +40,6 @@ pub struct Status { struct StatusModels { predict_duration: BTreeMap>, predict_intonation: BTreeMap>, - predict_contour: BTreeMap>>, decode: BTreeMap>, } @@ -53,7 +52,7 @@ struct SessionOptions { pub(crate) struct ModelFileSet { pub(crate) speaker_id_map: BTreeMap, pub(crate) metas_str: String, - models: Vec, + models: Vec, } impl ModelFileSet { @@ -77,25 +76,20 @@ impl ModelFileSet { let metas_str = fs_err::read_to_string(path("metas.json"))?; - let models = model_file::MODEL_FILE_NAMES + let models = model_file::TALK_MODEL_FILE_NAMES .iter() .map( - |&ModelFileNames { + |&TalkModelFileNames { predict_duration_model, predict_intonation_model, - predict_contour_model, decode_model, }| { let predict_duration_model = ModelFile::new(&path(predict_duration_model))?; let predict_intonation_model = ModelFile::new(&path(predict_intonation_model))?; - let predict_contour_model = predict_contour_model - .map(|s| ModelFile::new(&path(s))) - .transpose()?; let decode_model = ModelFile::new(&path(decode_model))?; - Ok(Model { + Ok(TalkModel { predict_duration_model, predict_intonation_model, - predict_contour_model, decode_model, }) }, @@ -116,10 +110,9 @@ impl ModelFileSet { } } -struct ModelFileNames { +struct TalkModelFileNames { predict_duration_model: &'static str, predict_intonation_model: &'static str, - predict_contour_model: Option<&'static str>, decode_model: &'static str, } @@ -127,10 +120,9 @@ struct ModelFileNames { #[error("不正なモデルファイルです")] struct DecryptModelError; -struct Model { +struct TalkModel { predict_duration_model: ModelFile, predict_intonation_model: ModelFile, - predict_contour_model: Option, decode_model: ModelFile, } @@ -216,7 +208,6 @@ impl Status { models: StatusModels { predict_duration: BTreeMap::new(), predict_intonation: BTreeMap::new(), - predict_contour: BTreeMap::new(), decode: BTreeMap::new(), }, light_session_options: SessionOptions::new(cpu_num_threads, false), @@ -245,11 +236,6 @@ impl Status { self.new_session(&model.predict_duration_model, &self.light_session_options)?; let predict_intonation_session = self.new_session(&model.predict_intonation_model, &self.light_session_options)?; - let predict_contour_session = if let Some(model) = &model.predict_contour_model { - Some(self.new_session(model, &self.light_session_options)?) - } else { - None - }; let decode_model = self.new_session(&model.decode_model, &self.heavy_session_options)?; @@ -259,9 +245,6 @@ impl Status { self.models .predict_intonation .insert(model_index, predict_intonation_session); - self.models - .predict_contour - .insert(model_index, predict_contour_session); self.models.decode.insert(model_index, decode_model); @@ -274,7 +257,6 @@ impl Status { pub fn is_model_loaded(&self, model_index: usize) -> bool { self.models.predict_duration.contains_key(&model_index) && self.models.predict_intonation.contains_key(&model_index) - && self.models.predict_contour.contains_key(&model_index) && self.models.decode.contains_key(&model_index) } @@ -356,29 +338,6 @@ impl Status { } } - pub fn predict_contour_session_run( - &mut self, - model_index: usize, - inputs: Vec<&mut dyn AnyArray>, - ) -> Result<(Vec, Vec)> { - if let Some(model) = self.models.predict_contour.get_mut(&model_index) { - if let Some(model) = model { - if let Ok(output_tensors) = model.run(inputs) { - Ok(( - output_tensors[0].as_slice().unwrap().to_owned(), - output_tensors[1].as_slice().unwrap().to_owned(), - )) - } else { - Err(Error::InferenceFailed) - } - } else { - Err(Error::UnsupportedModel) - } - } else { - Err(Error::InvalidModelIndex { model_index }) - } - } - pub fn decode_session_run( &mut self, model_index: usize, @@ -425,7 +384,6 @@ mod tests { ); assert!(status.models.predict_duration.is_empty()); assert!(status.models.predict_intonation.is_empty()); - assert!(status.models.predict_contour.is_empty()); assert!(status.models.decode.is_empty()); assert!(status.supported_styles.is_empty()); } @@ -453,7 +411,6 @@ mod tests { assert_debug_fmt_eq!(Ok(()), result); assert_eq!(1, status.models.predict_duration.len()); assert_eq!(1, status.models.predict_intonation.len()); - assert_eq!(1, status.models.predict_contour.len()); assert_eq!(1, status.models.decode.len()); } diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index aa618be76..c2cfd4dc3 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,4 +1,4 @@ -use super::{DecryptModelError, ModelFileNames}; +use super::{DecryptModelError, TalkModelFileNames}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) @@ -7,17 +7,10 @@ pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptMod pub(super) const SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1)), (2, (1, 0)), (3, (1, 1))]; -pub(super) const MODEL_FILE_NAMES: &[ModelFileNames] = &[ - ModelFileNames { +pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ + TalkModelFileNames { predict_duration_model: "predict_duration-0.onnx", predict_intonation_model: "predict_intonation-0.onnx", - predict_contour_model: None, decode_model: "decode-0.onnx", }, - ModelFileNames { - predict_duration_model: "predict_duration-1.onnx", - predict_intonation_model: "predict_intonation-1.onnx", - predict_contour_model: Some("predict_contour-1.onnx"), - decode_model: "decode-1.onnx", - }, ]; diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index 02a3b4efb..23b446072 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -127,33 +127,6 @@ pub extern "C" fn yukarin_sa_forward( } } -#[no_mangle] -pub extern "C" fn yukarin_sosf_forward( - length: i64, - f0_discrete: *mut f32, - phoneme: *mut i64, - speaker_id: *mut i64, - output: *mut f32, -) -> bool { - let result = lock_internal().predict_contour( - length as usize, - unsafe { std::slice::from_raw_parts(f0_discrete, length as usize) }, - unsafe { std::slice::from_raw_parts(phoneme, length as usize) }, - unsafe { *speaker_id as u32 }, - ); - match result { - Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; - output_slice.clone_from_slice(&output_vec); - true - } - Err(err) => { - set_message(&format!("{err}")); - false - } - } -} - #[no_mangle] pub extern "C" fn decode_forward( length: i64, From 7b6df240bc2709d42c55ac22794b47905545c25e Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 01:01:09 +0900 Subject: [PATCH 02/20] fix speaker id map --- crates/voicevox_core/src/status/model_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index c2cfd4dc3..0b6b6f336 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -5,7 +5,7 @@ pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptMod } pub(super) const SPEAKER_ID_MAP: &[(u32, (usize, u32))] = - &[(0, (0, 0)), (1, (0, 1)), (2, (1, 0)), (3, (1, 1))]; + &[(0, (0, 0)), (1, (0, 1))]; pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ TalkModelFileNames { From 720609d549e908228148fc735999a696d8c90a0d Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 01:57:22 +0900 Subject: [PATCH 03/20] rename functions and variables --- crates/voicevox_core/src/publish.rs | 16 ++--- crates/voicevox_core/src/status.rs | 68 +++++++++---------- crates/voicevox_core/src/status/model_file.rs | 3 +- 3 files changed, 43 insertions(+), 44 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 496fc7d73..875c364c4 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -344,8 +344,8 @@ impl InferenceCore { status.load_metas()?; if load_all_models { - for model_index in 0..MODEL_FILE_SET.models_count() { - status.load_model(model_index)?; + for model_index in 0..MODEL_FILE_SET.talk_models_count() { + status.load_talk_model(model_index)?; } } @@ -374,7 +374,7 @@ impl InferenceCore { .as_mut() .ok_or(Error::UninitializedStatus)?; if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) { - status.load_model(model_index) + status.load_talk_model(model_index) } else { Err(Error::InvalidSpeakerId { speaker_id }) } @@ -385,7 +385,7 @@ impl InferenceCore { pub fn is_model_loaded(&self, speaker_id: u32) -> bool { if let Some(status) = self.status_option.as_ref() { if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) { - status.is_model_loaded(model_index) + status.is_talk_model_loaded(model_index) } else { false } @@ -423,7 +423,7 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.models_count() { + if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -476,7 +476,7 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.models_count() { + if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -534,7 +534,7 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.models_count() { + if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -638,7 +638,7 @@ pub static SUPPORTED_DEVICES_CSTRING: Lazy = Lazy::new(|| CString::new(SUPPORTED_DEVICES.to_json().to_string()).unwrap()); fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { - MODEL_FILE_SET.speaker_id_map.get(&speaker_id).copied() + MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied() } pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str { diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 96c984385..f998dc726 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -31,7 +31,7 @@ pub(crate) static MODEL_FILE_SET: Lazy = Lazy::new(|| { }); pub struct Status { - models: StatusModels, + talk_models: StatusModels, light_session_options: SessionOptions, // 軽いモデルはこちらを使う heavy_session_options: SessionOptions, // 重いモデルはこちらを使う supported_styles: BTreeSet, @@ -50,9 +50,9 @@ struct SessionOptions { } pub(crate) struct ModelFileSet { - pub(crate) speaker_id_map: BTreeMap, + pub(crate) talk_speaker_id_map: BTreeMap, pub(crate) metas_str: String, - models: Vec, + talk_models: Vec, } impl ModelFileSet { @@ -76,7 +76,7 @@ impl ModelFileSet { let metas_str = fs_err::read_to_string(path("metas.json"))?; - let models = model_file::TALK_MODEL_FILE_NAMES + let talk_models = model_file::TALK_MODEL_FILE_NAMES .iter() .map( |&TalkModelFileNames { @@ -97,16 +97,16 @@ impl ModelFileSet { .collect::>()?; return Ok(Self { - speaker_id_map: model_file::SPEAKER_ID_MAP.iter().copied().collect(), + talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), metas_str, - models, + talk_models, }); const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR"; } - pub(crate) fn models_count(&self) -> usize { - self.models.len() + pub(crate) fn talk_models_count(&self) -> usize { + self.talk_models.len() } } @@ -205,7 +205,7 @@ unsafe impl Send for Status {} impl Status { pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self { Self { - models: StatusModels { + talk_models: StatusModels { predict_duration: BTreeMap::new(), predict_intonation: BTreeMap::new(), decode: BTreeMap::new(), @@ -229,9 +229,9 @@ impl Status { Ok(()) } - pub fn load_model(&mut self, model_index: usize) -> Result<()> { - if model_index < MODEL_FILE_SET.models.len() { - let model = &MODEL_FILE_SET.models[model_index]; + pub fn load_talk_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.talk_models.len() { + let model = &MODEL_FILE_SET.talk_models[model_index]; let predict_duration_session = self.new_session(&model.predict_duration_model, &self.light_session_options)?; let predict_intonation_session = @@ -239,14 +239,14 @@ impl Status { let decode_model = self.new_session(&model.decode_model, &self.heavy_session_options)?; - self.models + self.talk_models .predict_duration .insert(model_index, predict_duration_session); - self.models + self.talk_models .predict_intonation .insert(model_index, predict_intonation_session); - self.models.decode.insert(model_index, decode_model); + self.talk_models.decode.insert(model_index, decode_model); Ok(()) } else { @@ -254,10 +254,10 @@ impl Status { } } - pub fn is_model_loaded(&self, model_index: usize) -> bool { - self.models.predict_duration.contains_key(&model_index) - && self.models.predict_intonation.contains_key(&model_index) - && self.models.decode.contains_key(&model_index) + pub fn is_talk_model_loaded(&self, model_index: usize) -> bool { + self.talk_models.predict_duration.contains_key(&model_index) + && self.talk_models.predict_intonation.contains_key(&model_index) + && self.talk_models.decode.contains_key(&model_index) } fn new_session( @@ -311,7 +311,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.models.predict_duration.get_mut(&model_index) { + if let Some(model) = self.talk_models.predict_duration.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -327,7 +327,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.models.predict_intonation.get_mut(&model_index) { + if let Some(model) = self.talk_models.predict_intonation.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -343,7 +343,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.models.decode.get_mut(&model_index) { + if let Some(model) = self.talk_models.decode.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -382,9 +382,9 @@ mod tests { cpu_num_threads, status.heavy_session_options.cpu_num_threads ); - assert!(status.models.predict_duration.is_empty()); - assert!(status.models.predict_intonation.is_empty()); - assert!(status.models.decode.is_empty()); + assert!(status.talk_models.predict_duration.is_empty()); + assert!(status.talk_models.predict_intonation.is_empty()); + assert!(status.talk_models.decode.is_empty()); assert!(status.supported_styles.is_empty()); } @@ -405,27 +405,27 @@ mod tests { } #[rstest] - fn status_load_model_works() { + fn status_load_talk_model_works() { let mut status = Status::new(false, 0); - let result = status.load_model(0); + let result = status.load_talk_model(0); assert_debug_fmt_eq!(Ok(()), result); - assert_eq!(1, status.models.predict_duration.len()); - assert_eq!(1, status.models.predict_intonation.len()); - assert_eq!(1, status.models.decode.len()); + assert_eq!(1, status.talk_models.predict_duration.len()); + assert_eq!(1, status.talk_models.predict_intonation.len()); + assert_eq!(1, status.talk_models.decode.len()); } #[rstest] - fn status_is_model_loaded_works() { + fn status_is_talk_model_loaded_works() { let mut status = Status::new(false, 0); let model_index = 0; assert!( - !status.is_model_loaded(model_index), + !status.is_talk_model_loaded(model_index), "model should not be loaded" ); - let result = status.load_model(model_index); + let result = status.load_talk_model(model_index); assert_debug_fmt_eq!(Ok(()), result); assert!( - status.is_model_loaded(model_index), + status.is_talk_model_loaded(model_index), "model should be loaded" ); } diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index 0b6b6f336..c938a1489 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -4,8 +4,7 @@ pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptMod Ok(content.to_owned()) } -pub(super) const SPEAKER_ID_MAP: &[(u32, (usize, u32))] = - &[(0, (0, 0)), (1, (0, 1))]; +pub(super) const TALK_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ TalkModelFileNames { From 1c1b46e787c225475228ce126ba04afb1b784f2f Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:05:52 +0900 Subject: [PATCH 04/20] add models to model file --- crates/voicevox_core/src/status.rs | 10 +++++++++ crates/voicevox_core/src/status/model_file.rs | 22 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index f998dc726..aa0151c75 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -116,6 +116,16 @@ struct TalkModelFileNames { decode_model: &'static str, } +struct SingStyleModelFileNames { + predict_sing_consonant_length_model: &'static str, + predict_sing_f0_model: &'static str, + predict_sing_volume_model: &'static str, +} + +struct SourceFilterModelFileNames { + source_filter_decode_model: &'static str, +} + #[derive(thiserror::Error, Debug)] #[error("不正なモデルファイルです")] struct DecryptModelError; diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index c938a1489..d1d66d3bf 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,4 +1,4 @@ -use super::{DecryptModelError, TalkModelFileNames}; +use super::{DecryptModelError, TalkModelFileNames, SingStyleModelFileNames, SourceFilterModelFileNames}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) @@ -13,3 +13,23 @@ pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ decode_model: "decode-0.onnx", }, ]; + +// TODO: 変更する +pub(super) const SING_STYLE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; + +pub(super) const SING_STYLE_MODEL_FILE_NAMES: &[SingStyleModelFileNames] = &[ + SingStyleModelFileNames { + predict_sing_consonant_length_model: "predict_duration-1.onnx", + predict_sing_f0_model: "predict_intonation-1.onnx", + predict_sing_volume_model: "predict_intonation-1.onnx", + }, +]; + +pub(super) const SOURCE_FILTER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; + +pub(super) const SOURCE_FILTER_MODEL_FILE_NAMES: &[SourceFilterModelFileNames] = &[ + SourceFilterModelFileNames { + source_filter_decode_model: "decode-1.onnx", + }, +]; + From 77fb4bc19ac2403cc984874ee5e4b0af1827090e Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:16:53 +0900 Subject: [PATCH 05/20] add sing style and source filter models to model file set --- crates/voicevox_core/src/status.rs | 60 ++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index aa0151c75..195e5ace2 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -51,8 +51,12 @@ struct SessionOptions { pub(crate) struct ModelFileSet { pub(crate) talk_speaker_id_map: BTreeMap, + pub(crate) sing_style_speaker_id_map: BTreeMap, + pub(crate) source_filter_speaker_id_map: BTreeMap, pub(crate) metas_str: String, talk_models: Vec, + sing_style_models: Vec, + source_filter_models: Vec, } impl ModelFileSet { @@ -96,10 +100,48 @@ impl ModelFileSet { ) .collect::>()?; + let sing_style_models = model_file::SING_STYLE_MODEL_FILE_NAMES + .iter() + .map( + |&SingStyleModelFileNames { + predict_sing_consonant_length_model, + predict_sing_f0_model, + predict_sing_volume_model, + }| { + let predict_sing_consonant_length_model = ModelFile::new(&path(predict_sing_consonant_length_model))?; + let predict_sing_f0_model = ModelFile::new(&path(predict_sing_f0_model))?; + let predict_sing_volume_model = ModelFile::new(&path(predict_sing_volume_model))?; + Ok(SingStyleModel { + predict_sing_consonant_length_model, + predict_sing_f0_model, + predict_sing_volume_model, + }) + }, + ) + .collect::>()?; + + let source_filter_models = model_file::SOURCE_FILTER_MODEL_FILE_NAMES + .iter() + .map( + |&SourceFilterModelFileNames { + source_filter_decode_model, + }| { + let source_filter_decode_model = ModelFile::new(&path(source_filter_decode_model))?; + Ok(SourceFilterModel { + source_filter_decode_model, + }) + }, + ) + .collect::>()?; + return Ok(Self { talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), + sing_style_speaker_id_map: model_file::SING_STYLE_SPEAKER_ID_MAP.iter().copied().collect(), + source_filter_speaker_id_map: model_file::SOURCE_FILTER_SPEAKER_ID_MAP.iter().copied().collect(), metas_str, talk_models, + sing_style_models, + source_filter_models, }); const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR"; @@ -108,6 +150,14 @@ impl ModelFileSet { pub(crate) fn talk_models_count(&self) -> usize { self.talk_models.len() } + + pub(crate) fn sing_style_models_count(&self) -> usize { + self.sing_style_models.len() + } + + pub(crate) fn source_filter_models_count(&self) -> usize { + self.source_filter_models.len() + } } struct TalkModelFileNames { @@ -136,6 +186,16 @@ struct TalkModel { decode_model: ModelFile, } +struct SingStyleModel { + predict_sing_consonant_length_model: ModelFile, + predict_sing_f0_model: ModelFile, + predict_sing_volume_model: ModelFile, +} + +struct SourceFilterModel { + source_filter_decode_model: ModelFile, +} + struct ModelFile { path: PathBuf, content: Vec, From 278e4f9980a647cabe8ab3b93fc5631e38ce84d1 Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:22:52 +0900 Subject: [PATCH 06/20] add new models to status --- crates/voicevox_core/src/status.rs | 89 +++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 3 deletions(-) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 195e5ace2..25935de6e 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -31,18 +31,30 @@ pub(crate) static MODEL_FILE_SET: Lazy = Lazy::new(|| { }); pub struct Status { - talk_models: StatusModels, + talk_models: StatusTalkModels, + sing_style_models: StatusSingStyleModels, + source_filter_models: StatusSourceFilterModels, light_session_options: SessionOptions, // 軽いモデルはこちらを使う heavy_session_options: SessionOptions, // 重いモデルはこちらを使う supported_styles: BTreeSet, } -struct StatusModels { +struct StatusTalkModels { predict_duration: BTreeMap>, predict_intonation: BTreeMap>, decode: BTreeMap>, } +struct StatusSingStyleModels { + predict_sing_consonant_length: BTreeMap>, + predict_sing_f0: BTreeMap>, + predict_sing_volume: BTreeMap>, +} + +struct StatusSourceFilterModels { + source_filter_decode: BTreeMap>, +} + #[derive(new, Getters)] struct SessionOptions { cpu_num_threads: u16, @@ -275,11 +287,19 @@ unsafe impl Send for Status {} impl Status { pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self { Self { - talk_models: StatusModels { + talk_models: StatusTalkModels { predict_duration: BTreeMap::new(), predict_intonation: BTreeMap::new(), decode: BTreeMap::new(), }, + sing_style_models: StatusSingStyleModels { + predict_sing_consonant_length: BTreeMap::new(), + predict_sing_f0: BTreeMap::new(), + predict_sing_volume: BTreeMap::new(), + }, + source_filter_models: StatusSourceFilterModels { + source_filter_decode: BTreeMap::new(), + }, light_session_options: SessionOptions::new(cpu_num_threads, false), heavy_session_options: SessionOptions::new(cpu_num_threads, use_gpu), supported_styles: BTreeSet::default(), @@ -330,6 +350,69 @@ impl Status { && self.talk_models.decode.contains_key(&model_index) } + pub fn load_sing_style_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.sing_style_models.len() { + let model = &MODEL_FILE_SET.sing_style_models[model_index]; + let predict_sing_consonant_length_session = + self.new_session(&model.predict_sing_consonant_length_model, &self.light_session_options)?; + let predict_sing_f0_session = + self.new_session(&model.predict_sing_f0_model, &self.light_session_options)?; + let predict_sing_volume_session = + self.new_session(&model.predict_sing_volume_model, &self.light_session_options)?; + + self.sing_style_models.predict_sing_consonant_length.insert( + model_index, + predict_sing_consonant_length_session, + ); + self.sing_style_models + .predict_sing_f0 + .insert(model_index, predict_sing_f0_session); + self.sing_style_models + .predict_sing_volume + .insert(model_index, predict_sing_volume_session); + + Ok(()) + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn is_sing_style_model_loaded(&self, model_index: usize) -> bool { + self.sing_style_models + .predict_sing_consonant_length + .contains_key(&model_index) + && self + .sing_style_models + .predict_sing_f0 + .contains_key(&model_index) + && self + .sing_style_models + .predict_sing_volume + .contains_key(&model_index) + } + + pub fn load_source_filter_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.source_filter_models.len() { + let model = &MODEL_FILE_SET.source_filter_models[model_index]; + let source_filter_decode_session = + self.new_session(&model.source_filter_decode_model, &self.heavy_session_options)?; + + self.source_filter_models + .source_filter_decode + .insert(model_index, source_filter_decode_session); + + Ok(()) + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn is_source_filter_model_loaded(&self, model_index: usize) -> bool { + self.source_filter_models + .source_filter_decode + .contains_key(&model_index) + } + fn new_session( &self, model_file: &ModelFile, From ef83b525ef0c708191ed529a8a2a3f90abdbe922 Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:27:06 +0900 Subject: [PATCH 07/20] rename get model index and speaker id --- crates/voicevox_core/src/publish.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 875c364c4..edc8f9e4a 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -384,7 +384,7 @@ impl InferenceCore { } pub fn is_model_loaded(&self, speaker_id: u32) -> bool { if let Some(status) = self.status_option.as_ref() { - if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) { status.is_talk_model_loaded(model_index) } else { false @@ -417,7 +417,7 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_talk_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); @@ -470,7 +470,7 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_talk_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); @@ -528,7 +528,7 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_talk_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); @@ -637,7 +637,7 @@ pub static SUPPORTED_DEVICES: Lazy = pub static SUPPORTED_DEVICES_CSTRING: Lazy = Lazy::new(|| CString::new(SUPPORTED_DEVICES.to_json().to_string()).unwrap()); -fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { +fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied() } @@ -851,7 +851,7 @@ mod tests { #[case] speaker_id: u32, #[case] expected: Option<(usize, u32)>, ) { - let actual = get_model_index_and_speaker_id(speaker_id); + let actual = get_talk_model_index_and_speaker_id(speaker_id); assert_eq!(expected, actual); } From 19e1e0ec0ece2c2e1bbba8dbeedc5be885bb9911 Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:45:22 +0900 Subject: [PATCH 08/20] add new models session --- crates/voicevox_core/src/status.rs | 76 ++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 25935de6e..278d4ca07 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -506,6 +506,82 @@ impl Status { Err(Error::InvalidModelIndex { model_index }) } } + + pub fn predict_sing_consonant_length_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self + .sing_style_models + .predict_sing_consonant_length + .get_mut(&model_index) + { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn predict_sing_f0_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self.sing_style_models.predict_sing_f0.get_mut(&model_index) { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn predict_sing_volume_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self + .sing_style_models + .predict_sing_volume + .get_mut(&model_index) + { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn source_filter_decode_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self + .source_filter_models + .source_filter_decode + .get_mut(&model_index) + { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } } #[cfg(test)] From 37952bc6aa6cd34402a611c20118daabf350ae9a Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:52:49 +0900 Subject: [PATCH 09/20] change i32 to i64 --- crates/voicevox_core/src/status.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 278d4ca07..04955f80d 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -511,7 +511,7 @@ impl Status { &mut self, model_index: usize, inputs: Vec<&mut dyn AnyArray>, - ) -> Result> { + ) -> Result> { if let Some(model) = self .sing_style_models .predict_sing_consonant_length From 3c713a46abd29766b28435a5bcdea01bec5b5cd7 Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 02:59:25 +0900 Subject: [PATCH 10/20] add new predictor to inference core --- crates/voicevox_core/src/publish.rs | 209 +++++++++++++++++++++++++++- 1 file changed, 206 insertions(+), 3 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index edc8f9e4a..97d388bba 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -347,6 +347,12 @@ impl InferenceCore { for model_index in 0..MODEL_FILE_SET.talk_models_count() { status.load_talk_model(model_index)?; } + for model_index in 0..MODEL_FILE_SET.sing_style_models_count() { + status.load_sing_style_model(model_index)?; + } + for model_index in 0..MODEL_FILE_SET.source_filter_models_count() { + status.load_source_filter_model(model_index)?; + } } self.status_option = Some(status); @@ -373,10 +379,25 @@ impl InferenceCore { .status_option .as_mut() .ok_or(Error::UninitializedStatus)?; - if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) { status.load_talk_model(model_index) } else { - Err(Error::InvalidSpeakerId { speaker_id }) + // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する + let mut loaded = false; + if let Some((model_index, _)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + status.load_sing_style_model(model_index)?; + loaded = true; + } + if let Some((model_index, _)) = get_source_filter_model_index_and_speaker_id(speaker_id) { + status.load_source_filter_model(model_index)?; + loaded = true; + } + + if loaded { + Ok(()) + } else { + Err(Error::InvalidSpeakerId { speaker_id }) + } } } else { Err(Error::UninitializedStatus) @@ -387,7 +408,15 @@ impl InferenceCore { if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) { status.is_talk_model_loaded(model_index) } else { - false + // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する + let mut loaded = false; + if let Some((model_index, _)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + loaded |= status.is_sing_style_model_loaded(model_index); + } + if let Some((model_index, _)) = get_source_filter_model_index_and_speaker_id(speaker_id) { + loaded |= status.is_source_filter_model_loaded(model_index); + } + loaded } } else { false @@ -574,6 +603,172 @@ impl InferenceCore { .map(|output| Self::trim_padding_from_output(output, padding_size)) } + pub fn predict_sing_consonant_length( + &mut self, + consonant_vector: &[i64], + vowel_vector: &[i64], + note_duration_vector: &[i64], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = + if let Some((model_index, speaker_id)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sing_style_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + let mut consonant_vector_array = NdArray::new(ndarray::arr1(consonant_vector)); + let mut vowel_vector_array = NdArray::new(ndarray::arr1(vowel_vector)); + let mut note_duration_vector_array = NdArray::new(ndarray::arr1(note_duration_vector)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = + vec![&mut consonant_vector_array, &mut vowel_vector_array, &mut note_duration_vector_array, &mut speaker_id_array]; + + status.predict_sing_consonant_length_session_run(model_index, input_tensors) + } + + pub fn predict_sing_f0( + &mut self, + phoneme_vector: &[i64], + note_vector: &[i64], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = + if let Some((model_index, speaker_id)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sing_style_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); + let mut note_vector_array = NdArray::new(ndarray::arr1(note_vector)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = + vec![&mut phoneme_vector_array, &mut note_vector_array, &mut speaker_id_array]; + + status.predict_sing_f0_session_run(model_index, input_tensors) + } + + pub fn predict_sing_volume( + &mut self, + phoneme_vector: &[i64], + note_vector: &[i64], + _f0_vector: &[f32], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = + if let Some((model_index, speaker_id)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sing_style_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + // TODO: f0を使う + let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); + let mut note_vector_array = NdArray::new(ndarray::arr1(note_vector)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = + vec![&mut phoneme_vector_array, &mut note_vector_array, &mut speaker_id_array]; + + status.predict_sing_volume_session_run(model_index, input_tensors) + } + + pub fn source_filter_decode( + &mut self, + phoneme_vector: &[i64], + f0: &[f32], + volume: &[f32], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = + if let Some((model_index, speaker_id)) = get_source_filter_model_index_and_speaker_id(speaker_id) { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.source_filter_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); + let mut f0_array = NdArray::new(ndarray::arr1(f0)); + let mut volume_array = NdArray::new(ndarray::arr1(volume)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = + vec![&mut phoneme_vector_array, &mut f0_array, &mut volume_array, &mut speaker_id_array]; + + status.source_filter_decode_session_run(model_index, input_tensors) + } + fn make_f0_with_padding( f0_slice: &[f32], length_with_padding: usize, @@ -641,6 +836,14 @@ fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied() } +fn get_sing_style_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET.sing_style_speaker_id_map.get(&speaker_id).copied() +} + +fn get_source_filter_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET.source_filter_speaker_id_map.get(&speaker_id).copied() +} + pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str { // C APIのため、messageには必ず末尾にNULL文字を追加する use VoicevoxResultCode::*; From 7948818763386da8c90f9b4fe7b5e0d8bb83a7be Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 03:01:51 +0900 Subject: [PATCH 11/20] add new predictor to core --- crates/voicevox_core/src/publish.rs | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 97d388bba..4a06df5e6 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -157,6 +157,58 @@ impl VoicevoxCore { ) } + pub fn predict_sing_consonant_length( + &mut self, + consonant_vector: &[i64], + vowel_vector: &[i64], + note_duration_vector: &[i64], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_consonant_length( + consonant_vector, + vowel_vector, + note_duration_vector, + speaker_id, + ) + } + + pub fn predict_sing_f0( + &mut self, + phoneme_vector: &[i64], + note_vector: &[i64], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_f0(phoneme_vector, note_vector, speaker_id) + } + + pub fn predict_sing_volume( + &mut self, + phoneme_vector: &[i64], + note_vector: &[i64], + f0_vector: &[f32], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_volume(phoneme_vector, note_vector, f0_vector, speaker_id) + } + + pub fn source_filter_decode( + &mut self, + phoneme_vector: &[i64], + f0: &[f32], + volume: &[f32], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .source_filter_decode(phoneme_vector, f0, volume, speaker_id) + } + pub fn audio_query( &mut self, text: &str, From 23084890ce25b5921ffeaa2a3f661dee7249880a Mon Sep 17 00:00:00 2001 From: y-chan Date: Sun, 7 Jan 2024 03:10:03 +0900 Subject: [PATCH 12/20] add new predictor to compatible engine --- .../src/compatible_engine.rs | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index 23b446072..808702945 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -157,3 +157,117 @@ pub extern "C" fn decode_forward( } } } + +#[no_mangle] +pub extern "C" fn predict_sing_consonant_length_forward( + length: i64, + consonant: *mut i64, + vowel: *mut i64, + note_duration: *mut i64, + speaker_id: *mut i64, + output: *mut i64, +) -> bool { + let length = length as usize; + let result = lock_internal().predict_sing_consonant_length( + unsafe { std::slice::from_raw_parts(consonant, length) }, + unsafe { std::slice::from_raw_parts(vowel, length) }, + unsafe { std::slice::from_raw_parts(note_duration, length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[no_mangle] +pub extern "C" fn predict_sing_f0_forward( + length: i64, + phoneme: *mut i64, + note: *mut i64, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + let length = length as usize; + let result = lock_internal().predict_sing_f0( + unsafe { std::slice::from_raw_parts(phoneme, length) }, + unsafe { std::slice::from_raw_parts(note, length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[no_mangle] +pub extern "C" fn predict_sing_volume_forward( + length: i64, + phoneme: *mut i64, + note: *mut i64, + f0: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + let length = length as usize; + let result = lock_internal().predict_sing_volume( + unsafe { std::slice::from_raw_parts(phoneme, length) }, + unsafe { std::slice::from_raw_parts(note, length) }, + unsafe { std::slice::from_raw_parts(f0, length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[no_mangle] +pub extern "C" fn source_filter_decode_forward( + length: i64, + phoneme: *mut i64, + f0: *mut f32, + volume: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + let length = length as usize; + let result = lock_internal().source_filter_decode( + unsafe { std::slice::from_raw_parts(phoneme, length) }, + unsafe { std::slice::from_raw_parts(f0, length) }, + unsafe { std::slice::from_raw_parts(volume, length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} From 813ba66764efc02846579f782f32378cc7af25e0 Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 21:59:50 +0900 Subject: [PATCH 13/20] rename source filter to sf decode --- crates/voicevox_core/src/publish.rs | 28 ++++---- crates/voicevox_core/src/status.rs | 70 +++++++++---------- crates/voicevox_core/src/status/model_file.rs | 8 +-- .../src/compatible_engine.rs | 4 +- 4 files changed, 55 insertions(+), 55 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 4a06df5e6..ea6fd6785 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -197,7 +197,7 @@ impl VoicevoxCore { .predict_sing_volume(phoneme_vector, note_vector, f0_vector, speaker_id) } - pub fn source_filter_decode( + pub fn sf_decode( &mut self, phoneme_vector: &[i64], f0: &[f32], @@ -206,7 +206,7 @@ impl VoicevoxCore { ) -> Result> { self.synthesis_engine .inference_core_mut() - .source_filter_decode(phoneme_vector, f0, volume, speaker_id) + .sf_decode(phoneme_vector, f0, volume, speaker_id) } pub fn audio_query( @@ -402,8 +402,8 @@ impl InferenceCore { for model_index in 0..MODEL_FILE_SET.sing_style_models_count() { status.load_sing_style_model(model_index)?; } - for model_index in 0..MODEL_FILE_SET.source_filter_models_count() { - status.load_source_filter_model(model_index)?; + for model_index in 0..MODEL_FILE_SET.sf_models_count() { + status.load_sf_decode_model(model_index)?; } } @@ -440,8 +440,8 @@ impl InferenceCore { status.load_sing_style_model(model_index)?; loaded = true; } - if let Some((model_index, _)) = get_source_filter_model_index_and_speaker_id(speaker_id) { - status.load_source_filter_model(model_index)?; + if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { + status.load_sf_decode_model(model_index)?; loaded = true; } @@ -465,8 +465,8 @@ impl InferenceCore { if let Some((model_index, _)) = get_sing_style_model_index_and_speaker_id(speaker_id) { loaded |= status.is_sing_style_model_loaded(model_index); } - if let Some((model_index, _)) = get_source_filter_model_index_and_speaker_id(speaker_id) { - loaded |= status.is_source_filter_model_loaded(model_index); + if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { + loaded |= status.is_sf_decode_model_loaded(model_index); } loaded } @@ -779,7 +779,7 @@ impl InferenceCore { status.predict_sing_volume_session_run(model_index, input_tensors) } - pub fn source_filter_decode( + pub fn sf_decode( &mut self, phoneme_vector: &[i64], f0: &[f32], @@ -800,13 +800,13 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_source_filter_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.source_filter_models_count() { + if model_index >= MODEL_FILE_SET.sf_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -818,7 +818,7 @@ impl InferenceCore { let input_tensors: Vec<&mut dyn AnyArray> = vec![&mut phoneme_vector_array, &mut f0_array, &mut volume_array, &mut speaker_id_array]; - status.source_filter_decode_session_run(model_index, input_tensors) + status.sf_decode_session_run(model_index, input_tensors) } fn make_f0_with_padding( @@ -892,8 +892,8 @@ fn get_sing_style_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, MODEL_FILE_SET.sing_style_speaker_id_map.get(&speaker_id).copied() } -fn get_source_filter_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { - MODEL_FILE_SET.source_filter_speaker_id_map.get(&speaker_id).copied() +fn get_sf_decode_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET.sf_decode_speaker_id_map.get(&speaker_id).copied() } pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str { diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 04955f80d..a418ca7b6 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -33,7 +33,7 @@ pub(crate) static MODEL_FILE_SET: Lazy = Lazy::new(|| { pub struct Status { talk_models: StatusTalkModels, sing_style_models: StatusSingStyleModels, - source_filter_models: StatusSourceFilterModels, + sf_decode_models: StatusSfModels, light_session_options: SessionOptions, // 軽いモデルはこちらを使う heavy_session_options: SessionOptions, // 重いモデルはこちらを使う supported_styles: BTreeSet, @@ -51,8 +51,8 @@ struct StatusSingStyleModels { predict_sing_volume: BTreeMap>, } -struct StatusSourceFilterModels { - source_filter_decode: BTreeMap>, +struct StatusSfModels { + sf_decode: BTreeMap>, } #[derive(new, Getters)] @@ -64,11 +64,11 @@ struct SessionOptions { pub(crate) struct ModelFileSet { pub(crate) talk_speaker_id_map: BTreeMap, pub(crate) sing_style_speaker_id_map: BTreeMap, - pub(crate) source_filter_speaker_id_map: BTreeMap, + pub(crate) sf_decode_speaker_id_map: BTreeMap, pub(crate) metas_str: String, talk_models: Vec, sing_style_models: Vec, - source_filter_models: Vec, + sf_decode_models: Vec, } impl ModelFileSet { @@ -132,15 +132,15 @@ impl ModelFileSet { ) .collect::>()?; - let source_filter_models = model_file::SOURCE_FILTER_MODEL_FILE_NAMES + let sf_decode_models = model_file::SOURCE_FILTER_MODEL_FILE_NAMES .iter() .map( - |&SourceFilterModelFileNames { - source_filter_decode_model, + |&SfModelFileNames { + sf_decode_model, }| { - let source_filter_decode_model = ModelFile::new(&path(source_filter_decode_model))?; - Ok(SourceFilterModel { - source_filter_decode_model, + let sf_decode_model = ModelFile::new(&path(sf_decode_model))?; + Ok(SfDecodeModel { + sf_decode_model, }) }, ) @@ -149,11 +149,11 @@ impl ModelFileSet { return Ok(Self { talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), sing_style_speaker_id_map: model_file::SING_STYLE_SPEAKER_ID_MAP.iter().copied().collect(), - source_filter_speaker_id_map: model_file::SOURCE_FILTER_SPEAKER_ID_MAP.iter().copied().collect(), + sf_decode_speaker_id_map: model_file::SOURCE_FILTER_SPEAKER_ID_MAP.iter().copied().collect(), metas_str, talk_models, sing_style_models, - source_filter_models, + sf_decode_models, }); const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR"; @@ -167,8 +167,8 @@ impl ModelFileSet { self.sing_style_models.len() } - pub(crate) fn source_filter_models_count(&self) -> usize { - self.source_filter_models.len() + pub(crate) fn sf_models_count(&self) -> usize { + self.sf_decode_models.len() } } @@ -184,8 +184,8 @@ struct SingStyleModelFileNames { predict_sing_volume_model: &'static str, } -struct SourceFilterModelFileNames { - source_filter_decode_model: &'static str, +struct SfModelFileNames { + sf_decode_model: &'static str, } #[derive(thiserror::Error, Debug)] @@ -204,8 +204,8 @@ struct SingStyleModel { predict_sing_volume_model: ModelFile, } -struct SourceFilterModel { - source_filter_decode_model: ModelFile, +struct SfDecodeModel { + sf_decode_model: ModelFile, } struct ModelFile { @@ -297,8 +297,8 @@ impl Status { predict_sing_f0: BTreeMap::new(), predict_sing_volume: BTreeMap::new(), }, - source_filter_models: StatusSourceFilterModels { - source_filter_decode: BTreeMap::new(), + sf_decode_models: StatusSfModels { + sf_decode: BTreeMap::new(), }, light_session_options: SessionOptions::new(cpu_num_threads, false), heavy_session_options: SessionOptions::new(cpu_num_threads, use_gpu), @@ -391,15 +391,15 @@ impl Status { .contains_key(&model_index) } - pub fn load_source_filter_model(&mut self, model_index: usize) -> Result<()> { - if model_index < MODEL_FILE_SET.source_filter_models.len() { - let model = &MODEL_FILE_SET.source_filter_models[model_index]; - let source_filter_decode_session = - self.new_session(&model.source_filter_decode_model, &self.heavy_session_options)?; + pub fn load_sf_decode_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.sf_decode_models.len() { + let model = &MODEL_FILE_SET.sf_decode_models[model_index]; + let sf_decode_session = + self.new_session(&model.sf_decode_model, &self.heavy_session_options)?; - self.source_filter_models - .source_filter_decode - .insert(model_index, source_filter_decode_session); + self.sf_decode_models + .sf_decode + .insert(model_index, sf_decode_session); Ok(()) } else { @@ -407,9 +407,9 @@ impl Status { } } - pub fn is_source_filter_model_loaded(&self, model_index: usize) -> bool { - self.source_filter_models - .source_filter_decode + pub fn is_sf_decode_model_loaded(&self, model_index: usize) -> bool { + self.sf_decode_models + .sf_decode .contains_key(&model_index) } @@ -563,14 +563,14 @@ impl Status { } } - pub fn source_filter_decode_session_run( + pub fn sf_decode_session_run( &mut self, model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { if let Some(model) = self - .source_filter_models - .source_filter_decode + .sf_decode_models + .sf_decode .get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index d1d66d3bf..d4ddc3a5e 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,4 +1,4 @@ -use super::{DecryptModelError, TalkModelFileNames, SingStyleModelFileNames, SourceFilterModelFileNames}; +use super::{DecryptModelError, TalkModelFileNames, SingStyleModelFileNames, SfModelFileNames}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) @@ -27,9 +27,9 @@ pub(super) const SING_STYLE_MODEL_FILE_NAMES: &[SingStyleModelFileNames] = &[ pub(super) const SOURCE_FILTER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const SOURCE_FILTER_MODEL_FILE_NAMES: &[SourceFilterModelFileNames] = &[ - SourceFilterModelFileNames { - source_filter_decode_model: "decode-1.onnx", +pub(super) const SOURCE_FILTER_MODEL_FILE_NAMES: &[SfModelFileNames] = &[ + SfModelFileNames { + sf_decode_model: "decode-1.onnx", }, ]; diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index 808702945..d919f72f0 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -244,7 +244,7 @@ pub extern "C" fn predict_sing_volume_forward( } #[no_mangle] -pub extern "C" fn source_filter_decode_forward( +pub extern "C" fn sf_decode_forward( length: i64, phoneme: *mut i64, f0: *mut f32, @@ -253,7 +253,7 @@ pub extern "C" fn source_filter_decode_forward( output: *mut f32, ) -> bool { let length = length as usize; - let result = lock_internal().source_filter_decode( + let result = lock_internal().sf_decode( unsafe { std::slice::from_raw_parts(phoneme, length) }, unsafe { std::slice::from_raw_parts(f0, length) }, unsafe { std::slice::from_raw_parts(volume, length) }, From 81356c06984f7ba8dc620f900e8ab5647530ad92 Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 22:01:27 +0900 Subject: [PATCH 14/20] fix rename miss --- crates/voicevox_core/src/publish.rs | 4 ++-- crates/voicevox_core/src/status.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index ea6fd6785..80afb1849 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -402,7 +402,7 @@ impl InferenceCore { for model_index in 0..MODEL_FILE_SET.sing_style_models_count() { status.load_sing_style_model(model_index)?; } - for model_index in 0..MODEL_FILE_SET.sf_models_count() { + for model_index in 0..MODEL_FILE_SET.sf_decode_models_count() { status.load_sf_decode_model(model_index)?; } } @@ -806,7 +806,7 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.sf_models_count() { + if model_index >= MODEL_FILE_SET.sf_decode_models_count() { return Err(Error::InvalidModelIndex { model_index }); } diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index a418ca7b6..2ebc8e742 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -167,7 +167,7 @@ impl ModelFileSet { self.sing_style_models.len() } - pub(crate) fn sf_models_count(&self) -> usize { + pub(crate) fn sf_decode_models_count(&self) -> usize { self.sf_decode_models.len() } } From 790eeacad2ddde26120be0cedc4c7aea652e6ac9 Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 22:04:45 +0900 Subject: [PATCH 15/20] rename sing style to sing teacher --- crates/voicevox_core/src/publish.rs | 28 +++++----- crates/voicevox_core/src/status.rs | 54 +++++++++---------- crates/voicevox_core/src/status/model_file.rs | 6 +-- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 80afb1849..917ec5156 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -399,8 +399,8 @@ impl InferenceCore { for model_index in 0..MODEL_FILE_SET.talk_models_count() { status.load_talk_model(model_index)?; } - for model_index in 0..MODEL_FILE_SET.sing_style_models_count() { - status.load_sing_style_model(model_index)?; + for model_index in 0..MODEL_FILE_SET.sing_teacher_models_count() { + status.load_sing_teacher_model(model_index)?; } for model_index in 0..MODEL_FILE_SET.sf_decode_models_count() { status.load_sf_decode_model(model_index)?; @@ -436,8 +436,8 @@ impl InferenceCore { } else { // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する let mut loaded = false; - if let Some((model_index, _)) = get_sing_style_model_index_and_speaker_id(speaker_id) { - status.load_sing_style_model(model_index)?; + if let Some((model_index, _)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { + status.load_sing_teacher_model(model_index)?; loaded = true; } if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { @@ -462,8 +462,8 @@ impl InferenceCore { } else { // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する let mut loaded = false; - if let Some((model_index, _)) = get_sing_style_model_index_and_speaker_id(speaker_id) { - loaded |= status.is_sing_style_model_loaded(model_index); + if let Some((model_index, _)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { + loaded |= status.is_sing_teacher_model_loaded(model_index); } if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { loaded |= status.is_sf_decode_model_loaded(model_index); @@ -676,13 +676,13 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.sing_style_models_count() { + if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -717,13 +717,13 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.sing_style_models_count() { + if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -758,13 +758,13 @@ impl InferenceCore { } let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sing_style_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, speaker_id)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { (model_index, speaker_id) } else { return Err(Error::InvalidSpeakerId { speaker_id }); }; - if model_index >= MODEL_FILE_SET.sing_style_models_count() { + if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -888,8 +888,8 @@ fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied() } -fn get_sing_style_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { - MODEL_FILE_SET.sing_style_speaker_id_map.get(&speaker_id).copied() +fn get_sing_teacher_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET.sing_teacher_speaker_id_map.get(&speaker_id).copied() } fn get_sf_decode_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 2ebc8e742..172aa9145 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -32,7 +32,7 @@ pub(crate) static MODEL_FILE_SET: Lazy = Lazy::new(|| { pub struct Status { talk_models: StatusTalkModels, - sing_style_models: StatusSingStyleModels, + sing_teacher_models: StatusSingTeacherModels, sf_decode_models: StatusSfModels, light_session_options: SessionOptions, // 軽いモデルはこちらを使う heavy_session_options: SessionOptions, // 重いモデルはこちらを使う @@ -45,7 +45,7 @@ struct StatusTalkModels { decode: BTreeMap>, } -struct StatusSingStyleModels { +struct StatusSingTeacherModels { predict_sing_consonant_length: BTreeMap>, predict_sing_f0: BTreeMap>, predict_sing_volume: BTreeMap>, @@ -63,11 +63,11 @@ struct SessionOptions { pub(crate) struct ModelFileSet { pub(crate) talk_speaker_id_map: BTreeMap, - pub(crate) sing_style_speaker_id_map: BTreeMap, + pub(crate) sing_teacher_speaker_id_map: BTreeMap, pub(crate) sf_decode_speaker_id_map: BTreeMap, pub(crate) metas_str: String, talk_models: Vec, - sing_style_models: Vec, + sing_teacher_models: Vec, sf_decode_models: Vec, } @@ -112,10 +112,10 @@ impl ModelFileSet { ) .collect::>()?; - let sing_style_models = model_file::SING_STYLE_MODEL_FILE_NAMES + let sing_teacher_models = model_file::SING_TEACHER_MODEL_FILE_NAMES .iter() .map( - |&SingStyleModelFileNames { + |&SingTeacherModelFileNames { predict_sing_consonant_length_model, predict_sing_f0_model, predict_sing_volume_model, @@ -123,7 +123,7 @@ impl ModelFileSet { let predict_sing_consonant_length_model = ModelFile::new(&path(predict_sing_consonant_length_model))?; let predict_sing_f0_model = ModelFile::new(&path(predict_sing_f0_model))?; let predict_sing_volume_model = ModelFile::new(&path(predict_sing_volume_model))?; - Ok(SingStyleModel { + Ok(SingTeacherModel { predict_sing_consonant_length_model, predict_sing_f0_model, predict_sing_volume_model, @@ -148,11 +148,11 @@ impl ModelFileSet { return Ok(Self { talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), - sing_style_speaker_id_map: model_file::SING_STYLE_SPEAKER_ID_MAP.iter().copied().collect(), + sing_teacher_speaker_id_map: model_file::SING_STYLE_SPEAKER_ID_MAP.iter().copied().collect(), sf_decode_speaker_id_map: model_file::SOURCE_FILTER_SPEAKER_ID_MAP.iter().copied().collect(), metas_str, talk_models, - sing_style_models, + sing_teacher_models, sf_decode_models, }); @@ -163,8 +163,8 @@ impl ModelFileSet { self.talk_models.len() } - pub(crate) fn sing_style_models_count(&self) -> usize { - self.sing_style_models.len() + pub(crate) fn sing_teacher_models_count(&self) -> usize { + self.sing_teacher_models.len() } pub(crate) fn sf_decode_models_count(&self) -> usize { @@ -178,7 +178,7 @@ struct TalkModelFileNames { decode_model: &'static str, } -struct SingStyleModelFileNames { +struct SingTeacherModelFileNames { predict_sing_consonant_length_model: &'static str, predict_sing_f0_model: &'static str, predict_sing_volume_model: &'static str, @@ -198,7 +198,7 @@ struct TalkModel { decode_model: ModelFile, } -struct SingStyleModel { +struct SingTeacherModel { predict_sing_consonant_length_model: ModelFile, predict_sing_f0_model: ModelFile, predict_sing_volume_model: ModelFile, @@ -292,7 +292,7 @@ impl Status { predict_intonation: BTreeMap::new(), decode: BTreeMap::new(), }, - sing_style_models: StatusSingStyleModels { + sing_teacher_models: StatusSingTeacherModels { predict_sing_consonant_length: BTreeMap::new(), predict_sing_f0: BTreeMap::new(), predict_sing_volume: BTreeMap::new(), @@ -350,9 +350,9 @@ impl Status { && self.talk_models.decode.contains_key(&model_index) } - pub fn load_sing_style_model(&mut self, model_index: usize) -> Result<()> { - if model_index < MODEL_FILE_SET.sing_style_models.len() { - let model = &MODEL_FILE_SET.sing_style_models[model_index]; + pub fn load_sing_teacher_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.sing_teacher_models.len() { + let model = &MODEL_FILE_SET.sing_teacher_models[model_index]; let predict_sing_consonant_length_session = self.new_session(&model.predict_sing_consonant_length_model, &self.light_session_options)?; let predict_sing_f0_session = @@ -360,14 +360,14 @@ impl Status { let predict_sing_volume_session = self.new_session(&model.predict_sing_volume_model, &self.light_session_options)?; - self.sing_style_models.predict_sing_consonant_length.insert( + self.sing_teacher_models.predict_sing_consonant_length.insert( model_index, predict_sing_consonant_length_session, ); - self.sing_style_models + self.sing_teacher_models .predict_sing_f0 .insert(model_index, predict_sing_f0_session); - self.sing_style_models + self.sing_teacher_models .predict_sing_volume .insert(model_index, predict_sing_volume_session); @@ -377,16 +377,16 @@ impl Status { } } - pub fn is_sing_style_model_loaded(&self, model_index: usize) -> bool { - self.sing_style_models + pub fn is_sing_teacher_model_loaded(&self, model_index: usize) -> bool { + self.sing_teacher_models .predict_sing_consonant_length .contains_key(&model_index) && self - .sing_style_models + .sing_teacher_models .predict_sing_f0 .contains_key(&model_index) && self - .sing_style_models + .sing_teacher_models .predict_sing_volume .contains_key(&model_index) } @@ -513,7 +513,7 @@ impl Status { inputs: Vec<&mut dyn AnyArray>, ) -> Result> { if let Some(model) = self - .sing_style_models + .sing_teacher_models .predict_sing_consonant_length .get_mut(&model_index) { @@ -532,7 +532,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.sing_style_models.predict_sing_f0.get_mut(&model_index) { + if let Some(model) = self.sing_teacher_models.predict_sing_f0.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -549,7 +549,7 @@ impl Status { inputs: Vec<&mut dyn AnyArray>, ) -> Result> { if let Some(model) = self - .sing_style_models + .sing_teacher_models .predict_sing_volume .get_mut(&model_index) { diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index d4ddc3a5e..d169d4174 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,4 +1,4 @@ -use super::{DecryptModelError, TalkModelFileNames, SingStyleModelFileNames, SfModelFileNames}; +use super::{DecryptModelError, TalkModelFileNames, SingTeacherModelFileNames, SfModelFileNames}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) @@ -17,8 +17,8 @@ pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ // TODO: 変更する pub(super) const SING_STYLE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const SING_STYLE_MODEL_FILE_NAMES: &[SingStyleModelFileNames] = &[ - SingStyleModelFileNames { +pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = &[ + SingTeacherModelFileNames { predict_sing_consonant_length_model: "predict_duration-1.onnx", predict_sing_f0_model: "predict_intonation-1.onnx", predict_sing_volume_model: "predict_intonation-1.onnx", From d2df8c73a46af72c7789697db05f6efba2832985 Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 22:06:22 +0900 Subject: [PATCH 16/20] fix rename miss --- crates/voicevox_core/src/status.rs | 10 +++++----- crates/voicevox_core/src/status/model_file.rs | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 172aa9145..e50138922 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -132,10 +132,10 @@ impl ModelFileSet { ) .collect::>()?; - let sf_decode_models = model_file::SOURCE_FILTER_MODEL_FILE_NAMES + let sf_decode_models = model_file::SF_DECODE_MODEL_FILE_NAMES .iter() .map( - |&SfModelFileNames { + |&SfDecodeModelFileNames { sf_decode_model, }| { let sf_decode_model = ModelFile::new(&path(sf_decode_model))?; @@ -148,8 +148,8 @@ impl ModelFileSet { return Ok(Self { talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), - sing_teacher_speaker_id_map: model_file::SING_STYLE_SPEAKER_ID_MAP.iter().copied().collect(), - sf_decode_speaker_id_map: model_file::SOURCE_FILTER_SPEAKER_ID_MAP.iter().copied().collect(), + sing_teacher_speaker_id_map: model_file::SING_TEACHER_SPEAKER_ID_MAP.iter().copied().collect(), + sf_decode_speaker_id_map: model_file::SF_DECODE_SPEAKER_ID_MAP.iter().copied().collect(), metas_str, talk_models, sing_teacher_models, @@ -184,7 +184,7 @@ struct SingTeacherModelFileNames { predict_sing_volume_model: &'static str, } -struct SfModelFileNames { +struct SfDecodeModelFileNames { sf_decode_model: &'static str, } diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index d169d4174..e9976ff8d 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,4 +1,4 @@ -use super::{DecryptModelError, TalkModelFileNames, SingTeacherModelFileNames, SfModelFileNames}; +use super::{DecryptModelError, TalkModelFileNames, SingTeacherModelFileNames, SfDecodeModelFileNames}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) @@ -15,7 +15,7 @@ pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ ]; // TODO: 変更する -pub(super) const SING_STYLE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; +pub(super) const SING_TEACHER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = &[ SingTeacherModelFileNames { @@ -25,10 +25,10 @@ pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = & }, ]; -pub(super) const SOURCE_FILTER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; +pub(super) const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const SOURCE_FILTER_MODEL_FILE_NAMES: &[SfModelFileNames] = &[ - SfModelFileNames { +pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] = &[ + SfDecodeModelFileNames { sf_decode_model: "decode-1.onnx", }, ]; From 78ee7cf89fbcaeefd953ef024381f27e10885dc0 Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 22:16:31 +0900 Subject: [PATCH 17/20] remove vector --- crates/voicevox_core/src/publish.rs | 72 ++++++++++++++--------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 917ec5156..564112695 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -159,54 +159,54 @@ impl VoicevoxCore { pub fn predict_sing_consonant_length( &mut self, - consonant_vector: &[i64], - vowel_vector: &[i64], - note_duration_vector: &[i64], + consonant: &[i64], + vowel: &[i64], + note_duration: &[i64], speaker_id: u32, ) -> Result> { self.synthesis_engine .inference_core_mut() .predict_sing_consonant_length( - consonant_vector, - vowel_vector, - note_duration_vector, + consonant, + vowel, + note_duration, speaker_id, ) } pub fn predict_sing_f0( &mut self, - phoneme_vector: &[i64], - note_vector: &[i64], + phoneme: &[i64], + note: &[i64], speaker_id: u32, ) -> Result> { self.synthesis_engine .inference_core_mut() - .predict_sing_f0(phoneme_vector, note_vector, speaker_id) + .predict_sing_f0(phoneme, note, speaker_id) } pub fn predict_sing_volume( &mut self, - phoneme_vector: &[i64], - note_vector: &[i64], - f0_vector: &[f32], + phoneme: &[i64], + note: &[i64], + f0: &[f32], speaker_id: u32, ) -> Result> { self.synthesis_engine .inference_core_mut() - .predict_sing_volume(phoneme_vector, note_vector, f0_vector, speaker_id) + .predict_sing_volume(phoneme, note, f0, speaker_id) } pub fn sf_decode( &mut self, - phoneme_vector: &[i64], + phoneme: &[i64], f0: &[f32], volume: &[f32], speaker_id: u32, ) -> Result> { self.synthesis_engine .inference_core_mut() - .sf_decode(phoneme_vector, f0, volume, speaker_id) + .sf_decode(phoneme, f0, volume, speaker_id) } pub fn audio_query( @@ -657,9 +657,9 @@ impl InferenceCore { pub fn predict_sing_consonant_length( &mut self, - consonant_vector: &[i64], - vowel_vector: &[i64], - note_duration_vector: &[i64], + consonant: &[i64], + vowel: &[i64], + note_duration: &[i64], speaker_id: u32, ) -> Result> { if !self.initialized { @@ -686,21 +686,21 @@ impl InferenceCore { return Err(Error::InvalidModelIndex { model_index }); } - let mut consonant_vector_array = NdArray::new(ndarray::arr1(consonant_vector)); - let mut vowel_vector_array = NdArray::new(ndarray::arr1(vowel_vector)); - let mut note_duration_vector_array = NdArray::new(ndarray::arr1(note_duration_vector)); + let mut consonant_array = NdArray::new(ndarray::arr1(consonant)); + let mut vowel_array = NdArray::new(ndarray::arr1(vowel)); + let mut note_duration_array = NdArray::new(ndarray::arr1(note_duration)); let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut consonant_vector_array, &mut vowel_vector_array, &mut note_duration_vector_array, &mut speaker_id_array]; + vec![&mut consonant_array, &mut vowel_array, &mut note_duration_array, &mut speaker_id_array]; status.predict_sing_consonant_length_session_run(model_index, input_tensors) } pub fn predict_sing_f0( &mut self, - phoneme_vector: &[i64], - note_vector: &[i64], + phoneme: &[i64], + note: &[i64], speaker_id: u32, ) -> Result> { if !self.initialized { @@ -727,21 +727,21 @@ impl InferenceCore { return Err(Error::InvalidModelIndex { model_index }); } - let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); - let mut note_vector_array = NdArray::new(ndarray::arr1(note_vector)); + let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme)); + let mut note_array = NdArray::new(ndarray::arr1(note)); let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut phoneme_vector_array, &mut note_vector_array, &mut speaker_id_array]; + vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array]; status.predict_sing_f0_session_run(model_index, input_tensors) } pub fn predict_sing_volume( &mut self, - phoneme_vector: &[i64], - note_vector: &[i64], - _f0_vector: &[f32], + phoneme: &[i64], + note: &[i64], + _f0: &[f32], speaker_id: u32, ) -> Result> { if !self.initialized { @@ -769,19 +769,19 @@ impl InferenceCore { } // TODO: f0を使う - let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); - let mut note_vector_array = NdArray::new(ndarray::arr1(note_vector)); + let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme)); + let mut note_array = NdArray::new(ndarray::arr1(note)); let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut phoneme_vector_array, &mut note_vector_array, &mut speaker_id_array]; + vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array]; status.predict_sing_volume_session_run(model_index, input_tensors) } pub fn sf_decode( &mut self, - phoneme_vector: &[i64], + phoneme: &[i64], f0: &[f32], volume: &[f32], speaker_id: u32, @@ -810,13 +810,13 @@ impl InferenceCore { return Err(Error::InvalidModelIndex { model_index }); } - let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); + let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme)); let mut f0_array = NdArray::new(ndarray::arr1(f0)); let mut volume_array = NdArray::new(ndarray::arr1(volume)); let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut phoneme_vector_array, &mut f0_array, &mut volume_array, &mut speaker_id_array]; + vec![&mut phoneme_array, &mut f0_array, &mut volume_array, &mut speaker_id_array]; status.sf_decode_session_run(model_index, input_tensors) } From 1a4d0664fca73f8bd682f0aa33b9f446fc77eb1d Mon Sep 17 00:00:00 2001 From: Yuto Ashida Date: Mon, 8 Jan 2024 22:18:32 +0900 Subject: [PATCH 18/20] add TODO comment (add sing tests) Co-authored-by: Hiroshiba --- crates/voicevox_core/src/publish.rs | 2 ++ crates/voicevox_core/src/status.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 564112695..cf604afa8 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -1174,6 +1174,8 @@ mod tests { assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); } + # TODO: sing系のテストを足す + #[rstest] fn decode_works() { let internal = VoicevoxCore::new_with_mutex(); diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index e50138922..a243c35a6 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -658,4 +658,6 @@ mod tests { "model should be loaded" ); } + + # TODO: sing系のテスト足す } From 4c274915fac12a205cab4c250c62a2d4463dbeec Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 22:21:13 +0900 Subject: [PATCH 19/20] fix comment out --- crates/voicevox_core/src/publish.rs | 2 +- crates/voicevox_core/src/status.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index cf604afa8..8d585cfca 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -1174,7 +1174,7 @@ mod tests { assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); } - # TODO: sing系のテストを足す + // TODO: sing系のテストを足す #[rstest] fn decode_works() { diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index a243c35a6..4e9a02a03 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -659,5 +659,5 @@ mod tests { ); } - # TODO: sing系のテスト足す + // TODO: sing系のテスト足す } From 42a0ee6f561aa22a9ecfafca72f1e05bfd1f3eba Mon Sep 17 00:00:00 2001 From: y-chan Date: Mon, 8 Jan 2024 22:39:52 +0900 Subject: [PATCH 20/20] lint --- crates/voicevox_core/src/publish.rs | 138 ++++++++++-------- crates/voicevox_core/src/status.rs | 72 ++++----- crates/voicevox_core/src/status/model_file.rs | 31 ++-- 3 files changed, 133 insertions(+), 108 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 8d585cfca..9c2675209 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -166,12 +166,7 @@ impl VoicevoxCore { ) -> Result> { self.synthesis_engine .inference_core_mut() - .predict_sing_consonant_length( - consonant, - vowel, - note_duration, - speaker_id, - ) + .predict_sing_consonant_length(consonant, vowel, note_duration, speaker_id) } pub fn predict_sing_f0( @@ -436,11 +431,14 @@ impl InferenceCore { } else { // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する let mut loaded = false; - if let Some((model_index, _)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, _)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { status.load_sing_teacher_model(model_index)?; loaded = true; } - if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) + { status.load_sf_decode_model(model_index)?; loaded = true; } @@ -462,10 +460,13 @@ impl InferenceCore { } else { // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する let mut loaded = false; - if let Some((model_index, _)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, _)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { loaded |= status.is_sing_teacher_model_loaded(model_index); } - if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { + if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) + { loaded |= status.is_sf_decode_model_loaded(model_index); } loaded @@ -497,12 +498,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_talk_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_talk_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -550,12 +552,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_talk_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_talk_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -608,12 +611,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_talk_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_talk_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -675,12 +679,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -691,8 +696,12 @@ impl InferenceCore { let mut note_duration_array = NdArray::new(ndarray::arr1(note_duration)); let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); - let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut consonant_array, &mut vowel_array, &mut note_duration_array, &mut speaker_id_array]; + let input_tensors: Vec<&mut dyn AnyArray> = vec![ + &mut consonant_array, + &mut vowel_array, + &mut note_duration_array, + &mut speaker_id_array, + ]; status.predict_sing_consonant_length_session_run(model_index, input_tensors) } @@ -716,12 +725,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -757,12 +767,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sing_teacher_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -799,12 +810,13 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_sf_decode_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sf_decode_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; if model_index >= MODEL_FILE_SET.sf_decode_models_count() { return Err(Error::InvalidModelIndex { model_index }); @@ -815,8 +827,12 @@ impl InferenceCore { let mut volume_array = NdArray::new(ndarray::arr1(volume)); let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); - let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut phoneme_array, &mut f0_array, &mut volume_array, &mut speaker_id_array]; + let input_tensors: Vec<&mut dyn AnyArray> = vec![ + &mut phoneme_array, + &mut f0_array, + &mut volume_array, + &mut speaker_id_array, + ]; status.sf_decode_session_run(model_index, input_tensors) } @@ -889,11 +905,17 @@ fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> } fn get_sing_teacher_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { - MODEL_FILE_SET.sing_teacher_speaker_id_map.get(&speaker_id).copied() + MODEL_FILE_SET + .sing_teacher_speaker_id_map + .get(&speaker_id) + .copied() } fn get_sf_decode_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { - MODEL_FILE_SET.sf_decode_speaker_id_map.get(&speaker_id).copied() + MODEL_FILE_SET + .sf_decode_speaker_id_map + .get(&speaker_id) + .copied() } pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str { diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 4e9a02a03..d6a559947 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -120,9 +120,11 @@ impl ModelFileSet { predict_sing_f0_model, predict_sing_volume_model, }| { - let predict_sing_consonant_length_model = ModelFile::new(&path(predict_sing_consonant_length_model))?; + let predict_sing_consonant_length_model = + ModelFile::new(&path(predict_sing_consonant_length_model))?; let predict_sing_f0_model = ModelFile::new(&path(predict_sing_f0_model))?; - let predict_sing_volume_model = ModelFile::new(&path(predict_sing_volume_model))?; + let predict_sing_volume_model = + ModelFile::new(&path(predict_sing_volume_model))?; Ok(SingTeacherModel { predict_sing_consonant_length_model, predict_sing_f0_model, @@ -134,22 +136,22 @@ impl ModelFileSet { let sf_decode_models = model_file::SF_DECODE_MODEL_FILE_NAMES .iter() - .map( - |&SfDecodeModelFileNames { - sf_decode_model, - }| { - let sf_decode_model = ModelFile::new(&path(sf_decode_model))?; - Ok(SfDecodeModel { - sf_decode_model, - }) - }, - ) + .map(|&SfDecodeModelFileNames { sf_decode_model }| { + let sf_decode_model = ModelFile::new(&path(sf_decode_model))?; + Ok(SfDecodeModel { sf_decode_model }) + }) .collect::>()?; return Ok(Self { talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), - sing_teacher_speaker_id_map: model_file::SING_TEACHER_SPEAKER_ID_MAP.iter().copied().collect(), - sf_decode_speaker_id_map: model_file::SF_DECODE_SPEAKER_ID_MAP.iter().copied().collect(), + sing_teacher_speaker_id_map: model_file::SING_TEACHER_SPEAKER_ID_MAP + .iter() + .copied() + .collect(), + sf_decode_speaker_id_map: model_file::SF_DECODE_SPEAKER_ID_MAP + .iter() + .copied() + .collect(), metas_str, talk_models, sing_teacher_models, @@ -346,24 +348,30 @@ impl Status { pub fn is_talk_model_loaded(&self, model_index: usize) -> bool { self.talk_models.predict_duration.contains_key(&model_index) - && self.talk_models.predict_intonation.contains_key(&model_index) + && self + .talk_models + .predict_intonation + .contains_key(&model_index) && self.talk_models.decode.contains_key(&model_index) } pub fn load_sing_teacher_model(&mut self, model_index: usize) -> Result<()> { if model_index < MODEL_FILE_SET.sing_teacher_models.len() { let model = &MODEL_FILE_SET.sing_teacher_models[model_index]; - let predict_sing_consonant_length_session = - self.new_session(&model.predict_sing_consonant_length_model, &self.light_session_options)?; + let predict_sing_consonant_length_session = self.new_session( + &model.predict_sing_consonant_length_model, + &self.light_session_options, + )?; let predict_sing_f0_session = self.new_session(&model.predict_sing_f0_model, &self.light_session_options)?; - let predict_sing_volume_session = - self.new_session(&model.predict_sing_volume_model, &self.light_session_options)?; + let predict_sing_volume_session = self.new_session( + &model.predict_sing_volume_model, + &self.light_session_options, + )?; - self.sing_teacher_models.predict_sing_consonant_length.insert( - model_index, - predict_sing_consonant_length_session, - ); + self.sing_teacher_models + .predict_sing_consonant_length + .insert(model_index, predict_sing_consonant_length_session); self.sing_teacher_models .predict_sing_f0 .insert(model_index, predict_sing_f0_session); @@ -408,9 +416,7 @@ impl Status { } pub fn is_sf_decode_model_loaded(&self, model_index: usize) -> bool { - self.sf_decode_models - .sf_decode - .contains_key(&model_index) + self.sf_decode_models.sf_decode.contains_key(&model_index) } fn new_session( @@ -532,7 +538,11 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.sing_teacher_models.predict_sing_f0.get_mut(&model_index) { + if let Some(model) = self + .sing_teacher_models + .predict_sing_f0 + .get_mut(&model_index) + { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -568,11 +578,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self - .sf_decode_models - .sf_decode - .get_mut(&model_index) - { + if let Some(model) = self.sf_decode_models.sf_decode.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -658,6 +664,6 @@ mod tests { "model should be loaded" ); } - + // TODO: sing系のテスト足す } diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index e9976ff8d..eb8f8913c 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,4 +1,6 @@ -use super::{DecryptModelError, TalkModelFileNames, SingTeacherModelFileNames, SfDecodeModelFileNames}; +use super::{ + DecryptModelError, SfDecodeModelFileNames, SingTeacherModelFileNames, TalkModelFileNames, +}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) @@ -6,30 +8,25 @@ pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptMod pub(super) const TALK_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[ - TalkModelFileNames { - predict_duration_model: "predict_duration-0.onnx", - predict_intonation_model: "predict_intonation-0.onnx", - decode_model: "decode-0.onnx", - }, -]; +pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[TalkModelFileNames { + predict_duration_model: "predict_duration-0.onnx", + predict_intonation_model: "predict_intonation-0.onnx", + decode_model: "decode-0.onnx", +}]; // TODO: 変更する pub(super) const SING_TEACHER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = &[ - SingTeacherModelFileNames { +pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = + &[SingTeacherModelFileNames { predict_sing_consonant_length_model: "predict_duration-1.onnx", predict_sing_f0_model: "predict_intonation-1.onnx", predict_sing_volume_model: "predict_intonation-1.onnx", - }, -]; + }]; pub(super) const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] = &[ - SfDecodeModelFileNames { +pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] = + &[SfDecodeModelFileNames { sf_decode_model: "decode-1.onnx", - }, -]; - + }];