diff --git a/_typos.toml b/_typos.toml index 836b1d79a..fcf98cecd 100644 --- a/_typos.toml +++ b/_typos.toml @@ -8,4 +8,4 @@ NdArray="NdArray" # onnxruntime::session::NdArray [default.extend-words] [files] -extend-exclude = ["*.svg"] +extend-exclude = ["*.svg", "*.onnx"] diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 972021b79..984eba913 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -1210,113 +1210,113 @@ mod tests { assert_eq!(result.unwrap().len(), F0_LENGTH * 256); } - // #[rstest] - // fn predict_sing_f0_works() { - // let internal = VoicevoxCore::new_with_mutex(); - // internal - // .lock() - // .unwrap() - // .initialize(InitializeOptions { - // load_all_models: true, - // acceleration_mode: AccelerationMode::Cpu, - // ..Default::default() - // }) - // .unwrap(); - - // // 「テスト」という文章に対応する入力 - // let phoneme_vector = [0, 37, 14, 35, 6, 37, 30, 0]; - // let note_vector = [0, 30, 30, 40, 40, 50, 50, 0]; - - // let sing_teacher_speaker_id = 6000; - // let result = internal.lock().unwrap().predict_sing_f0( - // &phoneme_vector, - // &note_vector, - // sing_teacher_speaker_id, - // ); - - // assert!(result.is_ok(), "{result:?}"); - // assert_eq!(result.unwrap().len(), phoneme_vector.len()); - // } - - // #[rstest] - // fn predict_sing_volume_works() { - // let internal = VoicevoxCore::new_with_mutex(); - // internal - // .lock() - // .unwrap() - // .initialize(InitializeOptions { - // load_all_models: true, - // acceleration_mode: AccelerationMode::Cpu, - // ..Default::default() - // }) - // .unwrap(); - - // // 「テスト」という文章に対応する入力 - // let phoneme_vector = [0, 37, 14, 35, 6, 37, 30, 0]; - // let note_vector = [0, 30, 30, 40, 40, 50, 50, 0]; - // let f0_vector = [0., 5.905218, 5.905218, 0., 0., 5.565851, 5.565851, 0.]; - - // let sing_teacher_speaker_id = 6000; - // let result = internal.lock().unwrap().predict_sing_volume( - // &phoneme_vector, - // &note_vector, - // &f0_vector, 
- // sing_teacher_speaker_id, - // ); - - // assert!(result.is_ok(), "{result:?}"); - // assert_eq!(result.unwrap().len(), phoneme_vector.len()); - // } - - // #[rstest] - // fn sf_decode_works() { - // let internal = VoicevoxCore::new_with_mutex(); - // internal - // .lock() - // .unwrap() - // .initialize(InitializeOptions { - // acceleration_mode: AccelerationMode::Cpu, - // load_all_models: true, - // ..Default::default() - // }) - // .unwrap(); - - // // 「テスト」という文章に対応する入力 - // const F0_LENGTH: usize = 69; - // let mut f0 = [0.; F0_LENGTH]; - // f0[9..24].fill(5.905218); - // f0[37..60].fill(5.565851); - - // let mut volume = [0.; F0_LENGTH]; - // volume[9..24].fill(0.5); - // volume[24..37].fill(0.2); - // volume[37..60].fill(1.0); - - // let mut phoneme = [0; F0_LENGTH]; - // let mut set_one = |index, range| { - // for i in range { - // phoneme[i] = index; - // } - // }; - // set_one(0, 0..9); - // set_one(37, 9..13); - // set_one(14, 13..24); - // set_one(35, 24..30); - // set_one(6, 30..37); - // set_one(37, 37..45); - // set_one(30, 45..60); - // set_one(0, 60..69); - - // let sf_decoder_speaker_id = 3000; - // let result = - // internal - // .lock() - // .unwrap() - // .sf_decode(&phoneme, &f0, &volume, sf_decoder_speaker_id); - - // assert!(result.is_ok(), "{result:?}"); - // assert_eq!(result.unwrap().len(), F0_LENGTH * 256); - // } + #[rstest] + fn predict_sing_f0_works() { + let internal = VoicevoxCore::new_with_mutex(); + internal + .lock() + .unwrap() + .initialize(InitializeOptions { + load_all_models: true, + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }) + .unwrap(); + + // 「テスト」という文章に対応する入力 + let phoneme_vector = [0, 37, 14, 35, 6, 37, 30, 0]; + let note_vector = [0, 30, 30, 40, 40, 50, 50, 0]; + + let sing_teacher_speaker_id = 6000; + let result = internal.lock().unwrap().predict_sing_f0( + &phoneme_vector, + &note_vector, + sing_teacher_speaker_id, + ); + + assert!(result.is_ok(), "{result:?}"); + 
assert_eq!(result.unwrap().len(), phoneme_vector.len()); + } + + #[rstest] + fn predict_sing_volume_works() { + let internal = VoicevoxCore::new_with_mutex(); + internal + .lock() + .unwrap() + .initialize(InitializeOptions { + load_all_models: true, + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }) + .unwrap(); + + // 「テスト」という文章に対応する入力 + let phoneme_vector = [0, 37, 14, 35, 6, 37, 30, 0]; + let note_vector = [0, 30, 30, 40, 40, 50, 50, 0]; + let f0_vector = [0., 5.905218, 5.905218, 0., 0., 5.565851, 5.565851, 0.]; + + let sing_teacher_speaker_id = 6000; + let result = internal.lock().unwrap().predict_sing_volume( + &phoneme_vector, + &note_vector, + &f0_vector, + sing_teacher_speaker_id, + ); + + assert!(result.is_ok(), "{result:?}"); + assert_eq!(result.unwrap().len(), phoneme_vector.len()); + } + + #[rstest] + fn sf_decode_works() { + let internal = VoicevoxCore::new_with_mutex(); + internal + .lock() + .unwrap() + .initialize(InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + load_all_models: true, + ..Default::default() + }) + .unwrap(); + + // 「テスト」という文章に対応する入力 + const F0_LENGTH: usize = 69; + let mut f0 = [0.; F0_LENGTH]; + f0[9..24].fill(5.905218); + f0[37..60].fill(5.565851); + + let mut volume = [0.; F0_LENGTH]; + volume[9..24].fill(0.5); + volume[24..37].fill(0.2); + volume[37..60].fill(1.0); + + let mut phoneme = [0; F0_LENGTH]; + let mut set_one = |index, range| { + for i in range { + phoneme[i] = index; + } + }; + set_one(0, 0..9); + set_one(37, 9..13); + set_one(14, 13..24); + set_one(35, 24..30); + set_one(6, 30..37); + set_one(37, 37..45); + set_one(30, 45..60); + set_one(0, 60..69); + + let sf_decoder_speaker_id = 3000; + let result = + internal + .lock() + .unwrap() + .sf_decode(&phoneme, &f0, &volume, sf_decoder_speaker_id); + + assert!(result.is_ok(), "{result:?}"); + assert_eq!(result.unwrap().len(), F0_LENGTH * 256); + } type TextConsonantVowelData = [(&'static [(&'static str, &'static str, &'static str)], 
usize)]; diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 4dcee7b74..012d69973 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -619,6 +619,13 @@ mod tests { assert!(status.talk_models.predict_duration.is_empty()); assert!(status.talk_models.predict_intonation.is_empty()); assert!(status.talk_models.decode.is_empty()); + assert!(status + .sing_teacher_models + .predict_sing_consonant_length + .is_empty()); + assert!(status.sing_teacher_models.predict_sing_f0.is_empty()); + assert!(status.sing_teacher_models.predict_sing_volume.is_empty()); + assert!(status.sf_decode_models.sf_decode.is_empty()); assert!(status.supported_styles.is_empty()); } @@ -626,7 +633,7 @@ mod tests { fn status_load_metas_works() { let mut status = Status::new(true, 0); let result = status.load_metas(); - assert_debug_fmt_eq!(Ok(()), result); + assert_eq!(Ok(()), result); let expected = BTreeSet::from([0, 1, 2, 3, 3000, 6000]); assert_eq!(expected, status.supported_styles); } @@ -642,7 +649,7 @@ mod tests { fn status_load_talk_model_works() { let mut status = Status::new(false, 0); let result = status.load_talk_model(0); - assert_debug_fmt_eq!(Ok(()), result); + assert_eq!(Ok(()), result); assert_eq!(1, status.talk_models.predict_duration.len()); assert_eq!(1, status.talk_models.predict_intonation.len()); assert_eq!(1, status.talk_models.decode.len()); @@ -657,12 +664,66 @@ mod tests { "model should not be loaded" ); let result = status.load_talk_model(model_index); - assert_debug_fmt_eq!(Ok(()), result); + assert_eq!(Ok(()), result); assert!( status.is_talk_model_loaded(model_index), "model should be loaded" ); } - // TODO: sing系のテスト足す + #[rstest] + fn status_load_sing_teacher_model_works() { + let mut status = Status::new(false, 0); + let result = status.load_sing_teacher_model(0); + assert_eq!(Ok(()), result); + assert_eq!( + 1, + status + .sing_teacher_models + .predict_sing_consonant_length + .len() + ); 
+ assert_eq!(1, status.sing_teacher_models.predict_sing_f0.len()); + assert_eq!(1, status.sing_teacher_models.predict_sing_volume.len()); + } + + #[rstest] + fn status_is_sing_teacher_model_loaded_works() { + let mut status = Status::new(false, 0); + let model_index = 0; + assert!( + !status.is_sing_teacher_model_loaded(model_index), + "model should not be loaded" + ); + let result = status.load_sing_teacher_model(model_index); + assert_eq!(Ok(()), result); + assert!( + status.is_sing_teacher_model_loaded(model_index), + "model should be loaded" + ); + } + + #[rstest] + fn status_load_sf_decode_model_works() { + let mut status = Status::new(false, 0); + let result = status.load_sf_decode_model(0); + assert_eq!(Ok(()), result); + assert_eq!(1, status.sf_decode_models.sf_decode.len()); + } + + #[rstest] + fn status_is_sf_decode_model_loaded_works() { + let mut status = Status::new(false, 0); + let model_index = 0; + assert!( + !status.is_sf_decode_model_loaded(model_index), + "model should not be loaded" + ); + let result = status.load_sf_decode_model(model_index); + assert_eq!(Ok(()), result); + assert!( + status.is_sf_decode_model_loaded(model_index), + "model should be loaded" + ); + } } diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index e2d875a61..82fdd5873 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -14,23 +14,18 @@ pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[TalkModelFileN decode_model: "decode-0.onnx", }]; -// TODO: 変更する pub(super) const SING_TEACHER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(6000, (0, 0))]; pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = &[SingTeacherModelFileNames { - predict_sing_consonant_length_model: "predict_duration-1.onnx", - predict_sing_f0_model: "predict_intonation-1.onnx", - predict_sing_volume_model: "predict_intonation-1.onnx", - // 
predict_sing_consonant_length_model: "predict_sing_consonant_length-0.onnx", - // predict_sing_f0_model: "predict_sing_f0-0.onnx", - // predict_sing_volume_model: "predict_sing_volume-0.onnx", + predict_sing_consonant_length_model: "predict_sing_consonant_length-0.onnx", + predict_sing_f0_model: "predict_sing_f0-0.onnx", + predict_sing_volume_model: "predict_sing_volume-0.onnx", }]; pub(super) const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(3000, (0, 0))]; pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] = &[SfDecodeModelFileNames { - sf_decode_model: "decode-1.onnx", - // sf_decode_model: "sf_decoder-0.onnx", + sf_decode_model: "sf_decode-0.onnx", }]; diff --git a/model/predict_sing_consonant_length-0.onnx b/model/predict_sing_consonant_length-0.onnx new file mode 100644 index 000000000..88a85df7a Binary files /dev/null and b/model/predict_sing_consonant_length-0.onnx differ diff --git a/model/predict_sing_f0-0.onnx b/model/predict_sing_f0-0.onnx new file mode 100644 index 000000000..026c3fb1f Binary files /dev/null and b/model/predict_sing_f0-0.onnx differ diff --git a/model/predict_sing_volume-0.onnx b/model/predict_sing_volume-0.onnx new file mode 100644 index 000000000..d80f97cba Binary files /dev/null and b/model/predict_sing_volume-0.onnx differ diff --git a/model/sf_decode-0.onnx b/model/sf_decode-0.onnx new file mode 100644 index 000000000..169285cb4 Binary files /dev/null and b/model/sf_decode-0.onnx differ