diff --git a/crates/voicevox_core/src/infer/domains.rs b/crates/voicevox_core/src/infer/domains.rs index 54589b6e4..8383d931c 100644 --- a/crates/voicevox_core/src/infer/domains.rs +++ b/crates/voicevox_core/src/infer/domains.rs @@ -4,8 +4,9 @@ use educe::Educe; use serde::{Deserialize, Deserializer}; pub(crate) use self::talk::{ - DecodeInput, DecodeOutput, PredictDurationInput, PredictDurationOutput, PredictIntonationInput, - PredictIntonationOutput, TalkDomain, TalkOperation, + GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput, + PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, + RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation, }; #[derive(Educe)] diff --git a/crates/voicevox_core/src/infer/domains/talk.rs b/crates/voicevox_core/src/infer/domains/talk.rs index b7f7c1470..3dbeac762 100644 --- a/crates/voicevox_core/src/infer/domains/talk.rs +++ b/crates/voicevox_core/src/infer/domains/talk.rs @@ -41,10 +41,16 @@ pub(crate) enum TalkOperation { PredictIntonation, #[inference_operation( - type Input = DecodeInput; - type Output = DecodeOutput; + type Input = GenerateFullIntermediateInput; + type Output = GenerateFullIntermediateOutput; )] - Decode, + GenerateFullIntermediate, + + #[inference_operation( + type Input = RenderAudioSegmentInput; + type Output = RenderAudioSegmentOutput; + )] + RenderAudioSegment, } #[derive(InferenceInputSignature)] @@ -83,15 +89,28 @@ pub(crate) struct PredictIntonationOutput { #[derive(InferenceInputSignature)] #[inference_input_signature( - type Signature = Decode; + type Signature = GenerateFullIntermediate; )] -pub(crate) struct DecodeInput { +pub(crate) struct GenerateFullIntermediateInput { pub(crate) f0: Array2, pub(crate) phoneme: Array2, pub(crate) speaker_id: Array1, } #[derive(InferenceOutputSignature)] -pub(crate) struct DecodeOutput { +pub(crate) struct GenerateFullIntermediateOutput { + pub(crate) spec: Array2, +} + +#[derive(InferenceInputSignature)] +#[inference_input_signature( + type Signature = RenderAudioSegment; +)] +pub(crate) struct RenderAudioSegmentInput { + pub(crate) spec: Array2, +} + +#[derive(InferenceOutputSignature)] +pub(crate) struct RenderAudioSegmentOutput { pub(crate) wave: Array1, } diff --git a/crates/voicevox_core/src/manifest.rs b/crates/voicevox_core/src/manifest.rs index 465d64314..8a8290887 100644 --- a/crates/voicevox_core/src/manifest.rs +++ b/crates/voicevox_core/src/manifest.rs @@ -91,8 +91,11 @@ pub(crate) struct TalkManifest { #[index_for_fields(TalkOperation::PredictIntonation)] pub(crate) predict_intonation_filename: Arc, - #[index_for_fields(TalkOperation::Decode)] - pub(crate) decode_filename: Arc, + #[index_for_fields(TalkOperation::GenerateFullIntermediate)] + pub(crate) generate_full_intermediate_filename: Arc, + + #[index_for_fields(TalkOperation::RenderAudioSegment)] + pub(crate) render_audio_segment_filename: Arc, #[serde(default)] pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId, diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 44c7fe22e..3234c27c4 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -375,8 +375,9 @@ mod tests { let session_options = InferenceDomainMap { talk: enum_map! { TalkOperation::PredictDuration - | TalkOperation::PredictIntonation => light_session_options, - TalkOperation::Decode => heavy_session_options, + | TalkOperation::PredictIntonation + | TalkOperation::GenerateFullIntermediate => light_session_options, + TalkOperation::RenderAudioSegment => heavy_session_options, }, }; let status = Status::new( @@ -392,9 +393,13 @@ mod tests { light_session_options, status.session_options.talk[TalkOperation::PredictIntonation], ); + assert_eq!( + light_session_options, + status.session_options.talk[TalkOperation::GenerateFullIntermediate], + ); assert_eq!( heavy_session_options, - status.session_options.talk[TalkOperation::Decode], + status.session_options.talk[TalkOperation::RenderAudioSegment], ); assert!(status.loaded_models.lock().unwrap().0.is_empty()); diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index c99352a7a..adeb010a0 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -91,9 +91,10 @@ pub(crate) mod blocking { error::ErrorRepr, infer::{ domains::{ - DecodeInput, DecodeOutput, InferenceDomainMap, PredictDurationInput, - PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, TalkDomain, - TalkOperation, + GenerateFullIntermediateInput, GenerateFullIntermediateOutput, InferenceDomainMap, + PredictDurationInput, PredictDurationOutput, PredictIntonationInput, + PredictIntonationOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput, + TalkDomain, TalkOperation, }, InferenceRuntime as _, InferenceSessionOptions, }, @@ -204,8 +205,9 @@ pub(crate) mod blocking { InferenceDomainMap { talk: enum_map! { TalkOperation::PredictDuration - | TalkOperation::PredictIntonation => light_session_options, - TalkOperation::Decode => heavy_session_options, + | TalkOperation::PredictIntonation + | TalkOperation::GenerateFullIntermediate => light_session_options, + TalkOperation::RenderAudioSegment => heavy_session_options, }, }, ); @@ -935,9 +937,9 @@ pub(crate) mod blocking { padding_size, ); - let DecodeOutput { wave: output } = self.status.run_session( + let GenerateFullIntermediateOutput { spec } = self.status.run_session( model_id, - DecodeInput { + GenerateFullIntermediateInput { f0: ndarray::arr1(&f0_with_padding) .into_shape([length_with_padding, 1]) .unwrap(), @@ -948,6 +950,10 @@ pub(crate) mod blocking { }, )?; + let RenderAudioSegmentOutput { wave: output } = self + .status + .run_session(model_id, RenderAudioSegmentInput { spec })?; + return Ok(trim_padding_from_output( output.into_raw_vec(), padding_size, diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs index e6b08e634..96ff1045d 100644 --- a/crates/voicevox_core/src/voice_model.rs +++ b/crates/voicevox_core/src/voice_model.rs @@ -145,8 +145,11 @@ impl Inner { TalkOperation::PredictIntonation => { find_entry_index(&manifest.predict_intonation_filename)? } - TalkOperation::Decode => { - find_entry_index(&manifest.decode_filename)? + TalkOperation::GenerateFullIntermediate => { + find_entry_index(&manifest.generate_full_intermediate_filename)? + } + TalkOperation::RenderAudioSegment => { + find_entry_index(&manifest.render_audio_segment_filename)? } }; @@ -232,14 +235,20 @@ impl Inner { let talk = OptionFuture::from(talk.map( |(entries, style_id_to_inner_voice_id)| async move { - let [predict_duration, predict_intonation, decode] = entries.into_array(); + let [predict_duration, predict_intonation, predict_spectrogram, run_vocoder] = + entries.into_array(); let predict_duration = read_file!(predict_duration); let predict_intonation = read_file!(predict_intonation); - let decode = read_file!(decode); - - let model_bytes = - EnumMap::from_array([predict_duration, predict_intonation, decode]); + let predict_spectrogram = read_file!(predict_spectrogram); + let run_vocoder = read_file!(run_vocoder); + + let model_bytes = EnumMap::from_array([ + predict_duration, + predict_intonation, + predict_spectrogram, + run_vocoder, + ]); Ok((style_id_to_inner_voice_id, model_bytes)) }, diff --git a/crates/voicevox_core_macros/src/lib.rs b/crates/voicevox_core_macros/src/lib.rs index 933d51373..e96456ea8 100644 --- a/crates/voicevox_core_macros/src/lib.rs +++ b/crates/voicevox_core_macros/src/lib.rs @@ -119,8 +119,11 @@ pub fn derive_inference_output_signature( /// #[index_for_fields(TalkOperation::PredictIntonation)] /// pub(crate) predict_intonation_filename: Arc, /// -/// #[index_for_fields(TalkOperation::Decode)] -/// pub(crate) decode_filename: Arc, +/// #[index_for_fields(TalkOperation::GenerateFullIntermediate)] +/// pub(crate) generate_full_intermediate_filename: Arc, +/// +/// #[index_for_fields(TalkOperation::RenderAudioSegment)] +/// pub(crate) render_audio_segment_filename: Arc, /// /// // … /// } diff --git a/model/sample.vvm/manifest.json b/model/sample.vvm/manifest.json index db2ca92c1..1075a0797 100644 --- a/model/sample.vvm/manifest.json +++ b/model/sample.vvm/manifest.json @@ -5,7 +5,8 @@ "talk": { "predict_duration_filename": "predict_duration.onnx", "predict_intonation_filename": "predict_intonation.onnx", - "decode_filename": "decode.onnx", + "generate_full_intermediate_filename": "predict_spectrogram.onnx", + "render_audio_segment_filename": "vocoder.onnx", "style_id_to_inner_voice_id": { "302": 2, "303": 3 diff --git a/model/sample.vvm/predict_spectrogram.onnx b/model/sample.vvm/predict_spectrogram.onnx new file mode 100644 index 000000000..ad9da06c2 Binary files /dev/null and b/model/sample.vvm/predict_spectrogram.onnx differ diff --git a/model/sample.vvm/decode.onnx b/model/sample.vvm/vocoder.onnx similarity index 97% rename from model/sample.vvm/decode.onnx rename to model/sample.vvm/vocoder.onnx index 0551a8c16..f405f6832 100644 Binary files a/model/sample.vvm/decode.onnx and b/model/sample.vvm/vocoder.onnx differ