diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 2bb1f7e88..59a55d399 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -221,7 +221,7 @@ jobs: - name: Install cargo-binstall uses: taiki-e/install-action@cargo-binstall - name: Install cargo-edit - run: cargo binstall cargo-edit@^0.11 --no-confirm --log-level debug + run: cargo binstall cargo-edit@^0.11 --no-confirm --log-level debug --locked # NOTE: release-0.15で追加 # FIXME: `--locked`はもう要らないはず - name: set cargo version run: | cargo set-version "$VERSION" --exclude voicevox_core_python_api --exclude downloader --exclude xtask diff --git a/_typos.toml b/_typos.toml index 4c3b0f48b..111d084fa 100644 --- a/_typos.toml +++ b/_typos.toml @@ -12,4 +12,4 @@ NdArray="NdArray" # onnxruntime::session::NdArray [default.extend-words] [files] -extend-exclude = ["*.svg"] +extend-exclude = ["*.svg", "*.onnx"] diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 55c83097f..f0e96692c 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -148,6 +148,7 @@ pub enum ErrorKind { StyleNotFound, /// 音声モデルIDに対する音声モデルが見つからなかった。 ModelNotFound, + UnsupportedModel, // FIXME: dead code /// 推論に失敗した。 RunModel, /// コンテキストラベル出力に失敗した。 diff --git a/crates/voicevox_core/src/infer.rs b/crates/voicevox_core/src/infer.rs index 589c51777..49cbb476e 100644 --- a/crates/voicevox_core/src/infer.rs +++ b/crates/voicevox_core/src/infer.rs @@ -197,23 +197,33 @@ pub(crate) trait OutputScalar: Sized { fn extract(tensor: OutputTensor) -> std::result::Result, ExtractError>; } -impl OutputScalar for f32 { - const KIND: OutputScalarKind = OutputScalarKind::Float32; +#[duplicate_item( + T Kind; + [ i64 ] [ Int64 ]; + [ f32 ] [ Float32 ]; +)] +impl OutputScalar for T { + const KIND: OutputScalarKind = OutputScalarKind::Kind; fn extract(tensor: OutputTensor) -> std::result::Result, ExtractError> { match 
tensor { - OutputTensor::Float32(tensor) => Ok(tensor), + OutputTensor::Kind(tensor) => Ok(tensor), + _ => Err(ExtractError::Datatype), } } } #[derive(Clone, Copy, PartialEq, derive_more::Display)] pub(crate) enum OutputScalarKind { + #[display("int64_t")] + Int64, + #[display("float")] Float32, } pub(crate) enum OutputTensor { + Int64(ArrayD), Float32(ArrayD), } @@ -246,8 +256,12 @@ pub(crate) struct InferenceSessionOptions { pub(crate) device: DeviceSpec, } +// TODO: `ShapeError`を直接扱い、データ型違いはパニックにすべきでは? #[derive(Error, Debug)] pub(crate) enum ExtractError { + #[error("wrong datatype")] + Datatype, + #[error(transparent)] Shape(#[from] ShapeError), } diff --git a/crates/voicevox_core/src/infer/domains.rs b/crates/voicevox_core/src/infer/domains.rs index 8383d931c..f2f3e607a 100644 --- a/crates/voicevox_core/src/infer/domains.rs +++ b/crates/voicevox_core/src/infer/domains.rs @@ -1,75 +1,134 @@ +mod frame_decode; +mod singing_teacher; mod talk; use educe::Educe; use serde::{Deserialize, Deserializer}; -pub(crate) use self::talk::{ - GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput, - PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, - RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation, +pub(crate) use self::{ + frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput}, + singing_teacher::{ + PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input, + PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain, + SingingTeacherOperation, + }, + talk::{ + GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput, + PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, + RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation, + }, }; #[derive(Educe)] // TODO: `bounds`に`V: ?Sized`も入れようとすると、よくわからない理由で弾かれる。最新版のeduce // 
でもそうなのか?また最新版でも駄目だとしたら、弾いている理由は何なのか? -#[educe(Clone(bound = "V: InferenceDomainMapValues, V::Talk: Clone"))] +#[educe(Clone( + bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone" +))] pub(crate) struct InferenceDomainMap { pub(crate) talk: V::Talk, + pub(crate) singing_teacher: V::SingingTeacher, + pub(crate) frame_decode: V::FrameDecode, } -impl InferenceDomainMap<(T,)> { - pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T,)> { +impl InferenceDomainMap<(T, S, F)> { + pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> { let talk = &self.talk; - InferenceDomainMap { talk } + let singing_teacher = &self.singing_teacher; + let frame_decode = &self.frame_decode; + InferenceDomainMap { + talk, + singing_teacher, + frame_decode, + } } - pub(crate) fn map T2>( + pub(crate) fn map T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>( self, - fs: InferenceDomainMap<(Ft,)>, - ) -> InferenceDomainMap<(T2,)> { + fs: InferenceDomainMap<(Ft, Fs, Ff)>, + ) -> InferenceDomainMap<(T2, S2, F2)> { let talk = (fs.talk)(self.talk); - InferenceDomainMap { talk } + let singing_teacher = (fs.singing_teacher)(self.singing_teacher); + let frame_decode = (fs.frame_decode)(self.frame_decode); + InferenceDomainMap { + talk, + singing_teacher, + frame_decode, + } } } -impl InferenceDomainMap<(Result,)> { - pub(crate) fn collect(self) -> Result, E> { +impl InferenceDomainMap<(Result, Result, Result)> { + pub(crate) fn collect(self) -> Result, E> { let talk = self.talk?; - Ok(InferenceDomainMap { talk }) + let singing_teacher = self.singing_teacher?; + let frame_decode = self.frame_decode?; + Ok(InferenceDomainMap { + talk, + singing_teacher, + frame_decode, + }) } } impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap where V::Talk: Deserialize<'de>, + V::SingingTeacher: Deserialize<'de>, + V::FrameDecode: Deserialize<'de>, { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, { 
- let Repr { talk } = Repr::deserialize(deserializer)?; - return Ok(Self { talk }); + let Repr { + talk, + singing_teacher, + frame_decode, + } = Repr::deserialize(deserializer)?; + return Ok(Self { + talk, + singing_teacher, + frame_decode, + }); #[derive(Deserialize)] - struct Repr { + struct Repr { talk: T, + singing_teacher: S, + frame_decode: F, } } } pub(crate) trait InferenceDomainMapValues { type Talk; + type SingingTeacher; + type FrameDecode; } -impl InferenceDomainMapValues for (T,) { +impl InferenceDomainMapValues for (T, S, F) { type Talk = T; + type SingingTeacher = S; + type FrameDecode = F; } macro_rules! inference_domain_map_values { (for<$arg:ident> $body:ty) => { - (::macros::substitute_type!( - $body where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain - ),) + ( + ::macros::substitute_type!( + $body + where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain + ), + ::macros::substitute_type!( + $body + where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain + ), + ::macros::substitute_type!( + $body + where $arg = crate::infer::domains::FrameDecodeDomain as crate::infer::InferenceDomain + ), + ) }; } pub(crate) use inference_domain_map_values; diff --git a/crates/voicevox_core/src/infer/domains/frame_decode.rs b/crates/voicevox_core/src/infer/domains/frame_decode.rs new file mode 100644 index 000000000..c228196fc --- /dev/null +++ b/crates/voicevox_core/src/infer/domains/frame_decode.rs @@ -0,0 +1,52 @@ +use std::{collections::BTreeSet, sync::LazyLock}; + +use enum_map::Enum; +use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature}; +use ndarray::{Array1, Array2}; + +use crate::{manifest::FrameDecodeManifest, StyleType}; + +use super::super::{ + InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor, +}; + +pub(crate) enum FrameDecodeDomain {} + +impl InferenceDomain for FrameDecodeDomain { + type 
Operation = FrameDecodeOperation; + type Manifest = FrameDecodeManifest; + + fn style_types() -> &'static BTreeSet { + static STYLE_TYPES: LazyLock> = + LazyLock::new(|| [StyleType::FrameDecode, StyleType::Sing].into()); + &STYLE_TYPES + } +} + +#[derive(Clone, Copy, Enum, InferenceOperation)] +#[inference_operation( + type Domain = FrameDecodeDomain; +)] +pub(crate) enum FrameDecodeOperation { + #[inference_operation( + type Input = SfDecodeInput; + type Output = SfDecodeOutput; + )] + SfDecode, +} + +#[derive(InferenceInputSignature)] +#[inference_input_signature( + type Signature = SfDecode; +)] +pub(crate) struct SfDecodeInput { + pub(crate) frame_phonemes: Array2, + pub(crate) frame_f0s: Array2, + pub(crate) frame_volumes: Array2, + pub(crate) speaker_id: Array1, +} + +#[derive(InferenceOutputSignature)] +pub(crate) struct SfDecodeOutput { + pub(crate) wav: Array2, +} diff --git a/crates/voicevox_core/src/infer/domains/singing_teacher.rs b/crates/voicevox_core/src/infer/domains/singing_teacher.rs new file mode 100644 index 000000000..a2b9add80 --- /dev/null +++ b/crates/voicevox_core/src/infer/domains/singing_teacher.rs @@ -0,0 +1,95 @@ +use std::{collections::BTreeSet, sync::LazyLock}; + +use enum_map::Enum; +use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature}; +use ndarray::{Array1, Array2}; + +use crate::{manifest::SingingTeacherManifest, StyleType}; + +use super::super::{ + InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor, +}; + +pub(crate) enum SingingTeacherDomain {} + +impl InferenceDomain for SingingTeacherDomain { + type Operation = SingingTeacherOperation; + type Manifest = SingingTeacherManifest; + + fn style_types() -> &'static BTreeSet { + static STYLE_TYPES: LazyLock> = + LazyLock::new(|| [StyleType::SingingTeacher, StyleType::Sing].into()); + &STYLE_TYPES + } +} + +#[derive(Clone, Copy, Enum, InferenceOperation)] +#[inference_operation( + type Domain = 
SingingTeacherDomain; +)] +pub(crate) enum SingingTeacherOperation { + #[inference_operation( + type Input = PredictSingConsonantLengthInput; + type Output = PredictSingConsonantLengthOutput; + )] + PredictSingConsonantLength, + + #[inference_operation( + type Input = PredictSingF0Input; + type Output = PredictSingF0Output; + )] + PredictSingF0, + + #[inference_operation( + type Input = PredictSingVolumeInput; + type Output = PredictSingVolumeOutput; + )] + PredictSingVolume, +} + +#[derive(InferenceInputSignature)] +#[inference_input_signature( + type Signature = PredictSingConsonantLength; +)] +pub(crate) struct PredictSingConsonantLengthInput { + pub(crate) consonants: Array2, + pub(crate) vowels: Array2, + pub(crate) note_durations: Array2, + pub(crate) speaker_id: Array1, +} + +#[derive(InferenceOutputSignature)] +pub(crate) struct PredictSingConsonantLengthOutput { + pub(crate) consonant_lengths: Array2, +} + +#[derive(InferenceInputSignature)] +#[inference_input_signature( + type Signature = PredictSingF0; +)] +pub(crate) struct PredictSingF0Input { + pub(crate) phonemes: Array2, + pub(crate) notes: Array2, + pub(crate) speaker_id: Array1, +} + +#[derive(InferenceOutputSignature)] +pub(crate) struct PredictSingF0Output { + pub(crate) f0s: Array2, +} + +#[derive(InferenceInputSignature)] +#[inference_input_signature( + type Signature = PredictSingVolume; +)] +pub(crate) struct PredictSingVolumeInput { + pub(crate) phonemes: Array2, + pub(crate) notes: Array2, + pub(crate) frame_f0s: Array2, + pub(crate) speaker_id: Array1, +} + +#[derive(InferenceOutputSignature)] +pub(crate) struct PredictSingVolumeOutput { + pub(crate) volumes: Array2, +} diff --git a/crates/voicevox_core/src/infer/runtimes/onnxruntime.rs b/crates/voicevox_core/src/infer/runtimes/onnxruntime.rs index f2cc4fac7..3cfd11608 100644 --- a/crates/voicevox_core/src/infer/runtimes/onnxruntime.rs +++ b/crates/voicevox_core/src/infer/runtimes/onnxruntime.rs @@ -158,7 +158,7 @@ impl InferenceRuntime 
for self::blocking::Onnxruntime { TensorElementType::Uint16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16"), TensorElementType::Int16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16"), TensorElementType::Int32 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32"), - TensorElementType::Int64 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64"), + TensorElementType::Int64 => Ok(OutputScalarKind::Int64), TensorElementType::String => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING"), TensorElementType::Bfloat16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16"), TensorElementType::Float16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16"), @@ -253,6 +253,10 @@ fn extract_outputs(outputs: &ort::SessionOutputs<'_, '_>) -> anyhow::Result { + let output = output.try_extract_tensor::()?; + Ok(OutputTensor::Int64(output.into_owned())) + } TensorElementType::Float32 => { let output = output.try_extract_tensor::()?; Ok(OutputTensor::Float32(output.into_owned())) diff --git a/crates/voicevox_core/src/manifest.rs b/crates/voicevox_core/src/manifest.rs index 740254f6f..0c11c3647 100644 --- a/crates/voicevox_core/src/manifest.rs +++ b/crates/voicevox_core/src/manifest.rs @@ -13,7 +13,10 @@ use serde::{de, Deserialize, Deserializer, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use crate::{ - infer::domains::{inference_domain_map_values, InferenceDomainMap, TalkOperation}, + infer::domains::{ + inference_domain_map_values, FrameDecodeOperation, InferenceDomainMap, + SingingTeacherOperation, TalkOperation, + }, StyleId, VoiceModelId, }; @@ -82,6 +85,7 @@ pub struct Manifest { pub(crate) type ManifestDomains = inference_domain_map_values!(for Option); +// TODO: #825 が終わったら`singing_teacher`と`frame_decode`のやつと統一する #[derive(Deserialize)] #[cfg_attr(test, derive(Default))] pub(crate) struct TalkManifest { @@ -92,6 +96,26 @@ pub(crate) struct TalkManifest { pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId, } +#[derive(Deserialize)] +#[cfg_attr(test, derive(Default))] +pub(crate) struct 
SingingTeacherManifest { + #[serde(flatten)] + filenames: EnumMap>, + + #[serde(default)] + pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId, +} + +#[derive(Deserialize)] +#[cfg_attr(test, derive(Default))] +pub(crate) struct FrameDecodeManifest { + #[serde(flatten)] + filenames: EnumMap>, + + #[serde(default)] + pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId, +} + // TODO: #825 では`TalkOperation`と統合する。`Index`の実装もderive_moreで委譲する #[derive(Enum, Deserialize)] pub(crate) enum TalkOperationFilenameKey { @@ -121,6 +145,52 @@ impl Index for TalkManifest { } } +#[derive(Enum, Deserialize)] +pub(crate) enum SingingTeacherOperationFilenameKey { + #[serde(rename = "predict_sing_consonant_length_filename")] + PredictSingConsonantLength, + #[serde(rename = "predict_sing_f0_filename")] + PredictSingF0, + #[serde(rename = "predict_sing_volume_filename")] + PredictSingVolume, +} + +impl Index for SingingTeacherManifest { + type Output = Arc; + + fn index(&self, index: SingingTeacherOperation) -> &Self::Output { + let key = match index { + SingingTeacherOperation::PredictSingConsonantLength => { + SingingTeacherOperationFilenameKey::PredictSingConsonantLength + } + SingingTeacherOperation::PredictSingF0 => { + SingingTeacherOperationFilenameKey::PredictSingF0 + } + SingingTeacherOperation::PredictSingVolume => { + SingingTeacherOperationFilenameKey::PredictSingVolume + } + }; + &self.filenames[key] + } +} + +#[derive(Enum, Deserialize)] +pub(crate) enum FrameDecodeOperationFilenameKey { + #[serde(rename = "sf_decode_filename")] + SfDecode, +} + +impl Index for FrameDecodeManifest { + type Output = Arc; + + fn index(&self, index: FrameDecodeOperation) -> &Self::Output { + let key = match index { + FrameDecodeOperation::SfDecode => FrameDecodeOperationFilenameKey::SfDecode, + }; + &self.filenames[key] + } +} + #[serde_as] #[derive(Default, Clone, Deref, Deserialize)] #[deref(forward)] diff --git a/crates/voicevox_core/src/status.rs 
b/crates/voicevox_core/src/status.rs index c59573412..48ecb39d2 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -10,7 +10,10 @@ use crate::{ error::{ErrorRepr, LoadModelError, LoadModelErrorKind, LoadModelResult}, infer::{ self, - domains::{inference_domain_map_values, InferenceDomainMap, TalkDomain}, + domains::{ + inference_domain_map_values, FrameDecodeDomain, InferenceDomainMap, + SingingTeacherDomain, TalkDomain, + }, session_set::{InferenceSessionCell, InferenceSessionSet}, InferenceDomain, InferenceInputSignature, InferenceRuntime, InferenceSessionOptions, InferenceSignature, @@ -296,8 +299,10 @@ pub(crate) trait InferenceDomainExt: InferenceDomain { } #[duplicate_item( - T field; - [ TalkDomain ] [ talk ]; + T field; + [ TalkDomain ] [ talk ]; + [ SingingTeacherDomain ] [ singing_teacher ]; + [ FrameDecodeDomain ] [ frame_decode ]; )] impl InferenceDomainExt for T { fn visit( @@ -325,6 +330,8 @@ impl InferenceDomainMap { [ field; [ talk ]; + [ singing_teacher ]; + [ frame_decode ]; ] let field = self .field @@ -336,7 +343,11 @@ impl InferenceDomainMap { .transpose()?; } - Ok(InferenceDomainMap { talk }) + Ok(InferenceDomainMap { + talk, + singing_teacher, + frame_decode, + }) } } @@ -355,7 +366,9 @@ mod tests { use crate::{ devices::{DeviceSpec, GpuSpec}, infer::{ - domains::{InferenceDomainMap, TalkOperation}, + domains::{ + FrameDecodeOperation, InferenceDomainMap, SingingTeacherOperation, TalkOperation, + }, InferenceSessionOptions, }, macros::tests::assert_debug_fmt_eq, @@ -381,6 +394,14 @@ mod tests { | TalkOperation::GenerateFullIntermediate => light_session_options, TalkOperation::RenderAudioSegment => heavy_session_options, }, + singing_teacher: enum_map! { + SingingTeacherOperation::PredictSingConsonantLength + | SingingTeacherOperation::PredictSingF0 + | SingingTeacherOperation::PredictSingVolume => light_session_options, + }, + frame_decode: enum_map! 
{ + FrameDecodeOperation::SfDecode => heavy_session_options, + }, }; let status = Status::new( crate::blocking::Onnxruntime::from_test_util_data().unwrap(), @@ -414,6 +435,8 @@ mod tests { crate::blocking::Onnxruntime::from_test_util_data().unwrap(), InferenceDomainMap { talk: enum_map!(_ => InferenceSessionOptions::new(0, DeviceSpec::Cpu)), + singing_teacher: enum_map!(_ => InferenceSessionOptions::new(0, DeviceSpec::Cpu)), + frame_decode: enum_map!(_ => InferenceSessionOptions::new(0, DeviceSpec::Cpu)), }, ); let model = &crate::nonblocking::VoiceModelFile::sample().await.unwrap(); @@ -430,6 +453,8 @@ mod tests { crate::blocking::Onnxruntime::from_test_util_data().unwrap(), InferenceDomainMap { talk: enum_map!(_ => InferenceSessionOptions::new(0, DeviceSpec::Cpu)), + singing_teacher: enum_map!(_ => InferenceSessionOptions::new(0, DeviceSpec::Cpu)), + frame_decode: enum_map!(_ => InferenceSessionOptions::new(0, DeviceSpec::Cpu)), }, ); let vvm = &crate::nonblocking::VoiceModelFile::sample().await.unwrap(); diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 230a26294..61f9dbf98 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -101,6 +101,7 @@ impl AsyncExt for BlockingThreadPool { } mod inner { + use easy_ext::ext; use enum_map::enum_map; use std::{ io::{Cursor, Write as _}, @@ -118,9 +119,13 @@ mod inner { infer::{ self, domains::{ - GenerateFullIntermediateInput, GenerateFullIntermediateOutput, InferenceDomainMap, - PredictDurationInput, PredictDurationOutput, PredictIntonationInput, - PredictIntonationOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput, + FrameDecodeDomain, FrameDecodeOperation, GenerateFullIntermediateInput, + GenerateFullIntermediateOutput, InferenceDomainMap, PredictDurationInput, + PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, + PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, + 
PredictSingF0Input, PredictSingF0Output, PredictSingVolumeInput, + PredictSingVolumeOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput, + SfDecodeInput, SfDecodeOutput, SingingTeacherDomain, SingingTeacherOperation, TalkDomain, TalkOperation, }, InferenceRuntime, InferenceSessionOptions, @@ -174,7 +179,7 @@ mod inner { } pub struct Inner { - status: Arc>, + pub(super) status: Arc>, open_jtalk_analyzer: OpenJTalkAnalyzer, kana_analyzer: KanaAnalyzer, use_gpu: bool, @@ -250,6 +255,14 @@ mod inner { | TalkOperation::GenerateFullIntermediate => light_session_options, TalkOperation::RenderAudioSegment => heavy_session_options, }, + singing_teacher: enum_map! { + SingingTeacherOperation::PredictSingConsonantLength + | SingingTeacherOperation::PredictSingF0 + | SingingTeacherOperation::PredictSingVolume => light_session_options, + }, + frame_decode: enum_map! { + FrameDecodeOperation::SfDecode => heavy_session_options, + }, }, ) .into(); @@ -1053,6 +1066,108 @@ mod inner { let output = trim_margin_from_wave(output_with_margin); Ok(output.to_vec()) } + + pub(super) async fn predict_sing_consonant_length( + &self, + consonant: ndarray::Array1, + vowel: ndarray::Array1, + note_duration: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingConsonantLengthOutput { consonant_lengths } = self + .run_session::( + model_id, + PredictSingConsonantLengthInput { + consonants: consonant.into_one_row(), + vowels: vowel.into_one_row(), + note_durations: note_duration.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; + + Ok(consonant_lengths) + } + + pub(super) async fn predict_sing_f0( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingF0Output { f0s } = self + .run_session::( + model_id, + PredictSingF0Input { + phonemes: 
phoneme.into_one_row(), + notes: note.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; + + Ok(f0s) + } + + pub(super) async fn predict_sing_volume( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + f0: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingVolumeOutput { volumes } = self + .run_session::( + model_id, + PredictSingVolumeInput { + phonemes: phoneme.into_one_row(), + notes: note.into_one_row(), + frame_f0s: f0.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; + + Ok(volumes) + } + + pub(super) async fn sf_decode( + &self, + phoneme: ndarray::Array1, + f0: ndarray::Array1, + volume: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let SfDecodeOutput { wav } = self + .run_session::( + model_id, + SfDecodeInput { + frame_phonemes: phoneme.into_one_row(), + frame_f0s: f0.into_one_row(), + frame_volumes: volume.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; + + Ok(wav) + } + } + + #[ext] + impl ndarray::Array1 { + fn into_one_row(self) -> ndarray::Array2 { + let n = self.len(); + self.into_shape([1, n]).expect("should be ok") + } } #[cfg(windows)] @@ -1590,6 +1705,62 @@ pub(crate) mod blocking { .decode(length, phoneme_size, f0, phoneme_vector, style_id) .block_on() } + + pub fn predict_sing_consonant_length( + &self, + consonant: ndarray::Array1, + vowel: ndarray::Array1, + note_duration: ndarray::Array1, + style_id: StyleId, + ) -> crate::Result> { + self.0 + .status + .predict_sing_consonant_length::( + consonant, + vowel, + note_duration, + style_id, + ) + .block_on() + } + + pub fn predict_sing_f0( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + style_id: StyleId, + ) -> crate::Result> { + self.0 + .status + 
.predict_sing_f0::(phoneme, note, style_id) + .block_on() + } + + pub fn predict_sing_volume( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + f0: ndarray::Array1, + style_id: StyleId, + ) -> crate::Result> { + self.0 + .status + .predict_sing_volume::(phoneme, note, f0, style_id) + .block_on() + } + + pub fn sf_decode( + &self, + phoneme: ndarray::Array1, + f0: ndarray::Array1, + volume: ndarray::Array1, + style_id: StyleId, + ) -> crate::Result> { + self.0 + .status + .sf_decode::(phoneme, f0, volume, style_id) + .block_on() + } } } @@ -1903,7 +2074,10 @@ pub(crate) mod nonblocking { #[cfg(test)] mod tests { use super::{AccelerationMode, InitializeOptions}; - use crate::{engine::Mora, macros::tests::assert_debug_fmt_eq, AccentPhrase, Result, StyleId}; + use crate::{ + asyncs::BlockingThreadPool, engine::Mora, macros::tests::assert_debug_fmt_eq, AccentPhrase, + Result, StyleId, + }; use ::test_util::OPEN_JTALK_DIC_DIR; use rstest::rstest; @@ -2064,6 +2238,9 @@ mod tests { assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); } + // TODO: ↓ 有るのでは? 
+ // TODO: sing系のテストを足す + #[rstest] #[tokio::test] async fn decode_works() { @@ -2114,6 +2291,145 @@ mod tests { assert_eq!(result.unwrap().len(), F0_LENGTH * 256); } + #[rstest] + #[tokio::test] + async fn predict_sing_f0_works() { + let syntesizer = super::nonblocking::Synthesizer::new( + crate::nonblocking::Onnxruntime::from_test_util_data() + .await + .unwrap(), + (), + &InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }, + ) + .unwrap(); + syntesizer + .load_voice_model(&crate::nonblocking::VoiceModelFile::sample().await.unwrap()) + .await + .unwrap(); + + // 「テスト」という文章に対応する入力 + let phoneme_vector = ndarray::array![0, 37, 14, 35, 6, 37, 30, 0]; + let note_vector = ndarray::array![0, 30, 30, 40, 40, 50, 50, 0]; + + let sing_teacher_style_id = StyleId::new(6000); + let result = syntesizer + .0 + .status + .predict_sing_f0::( + phoneme_vector.clone(), + note_vector, + sing_teacher_style_id, + ) + .await; + + assert!(result.is_ok(), "{result:?}"); + assert_eq!(result.unwrap().len(), phoneme_vector.len()); + } + + #[rstest] + #[tokio::test] + async fn predict_sing_volume_works() { + let syntesizer = super::nonblocking::Synthesizer::new( + crate::nonblocking::Onnxruntime::from_test_util_data() + .await + .unwrap(), + (), + &InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }, + ) + .unwrap(); + syntesizer + .load_voice_model(&crate::nonblocking::VoiceModelFile::sample().await.unwrap()) + .await + .unwrap(); + + // 「テスト」という文章に対応する入力 + let phoneme_vector = ndarray::array![0, 37, 14, 35, 6, 37, 30, 0]; + let note_vector = ndarray::array![0, 30, 30, 40, 40, 50, 50, 0]; + let f0_vector = ndarray::array![0., 5.905218, 5.905218, 0., 0., 5.565851, 5.565851, 0.]; + + let sing_teacher_style_id = StyleId::new(6000); + let result = syntesizer + .0 + .status + .predict_sing_volume::( + phoneme_vector.clone(), + note_vector, + f0_vector, + sing_teacher_style_id, + ) + .await; + + 
assert!(result.is_ok(), "{result:?}"); + assert_eq!(result.unwrap().len(), phoneme_vector.len()); + } + + #[rstest] + #[tokio::test] + async fn sf_decode_works() { + let syntesizer = super::nonblocking::Synthesizer::new( + crate::nonblocking::Onnxruntime::from_test_util_data() + .await + .unwrap(), + (), + &InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }, + ) + .unwrap(); + syntesizer + .load_voice_model(&crate::nonblocking::VoiceModelFile::sample().await.unwrap()) + .await + .unwrap(); + + // 「テスト」という文章に対応する入力 + const F0_LENGTH: usize = 69; + let mut f0 = [0.; F0_LENGTH]; + f0[9..24].fill(5.905218); + f0[37..60].fill(5.565851); + + let mut volume = [0.; F0_LENGTH]; + volume[9..24].fill(0.5); + volume[24..37].fill(0.2); + volume[37..60].fill(1.0); + + let mut phoneme = [0; F0_LENGTH]; + let mut set_one = |index, range| { + for i in range { + phoneme[i] = index; + } + }; + set_one(0, 0..9); + set_one(37, 9..13); + set_one(14, 13..24); + set_one(35, 24..30); + set_one(6, 30..37); + set_one(37, 37..45); + set_one(30, 45..60); + set_one(0, 60..69); + + let sf_decode_style_id = StyleId::new(3000); + let result = syntesizer + .0 + .status + .sf_decode::( + ndarray::arr1(&phoneme), + ndarray::arr1(&f0), + ndarray::arr1(&volume), + sf_decode_style_id, + ) + .await; + + assert!(result.is_ok(), "{result:?}"); + assert_eq!(result.unwrap().len(), F0_LENGTH * 256); + } + type TextConsonantVowelData = [(&'static [(&'static str, &'static str, &'static str)], usize)]; diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs index d0dde486f..754064a29 100644 --- a/crates/voicevox_core/src/voice_model.rs +++ b/crates/voicevox_core/src/voice_model.rs @@ -23,7 +23,10 @@ use crate::{ asyncs::{Async, Mutex as _}, error::{LoadModelError, LoadModelErrorKind, LoadModelResult}, infer::{ - domains::{inference_domain_map_values, InferenceDomainMap, TalkDomain}, + domains::{ + inference_domain_map_values, 
FrameDecodeDomain, InferenceDomainMap, + SingingTeacherDomain, TalkDomain, + }, InferenceDomain, }, manifest::{Manifest, ManifestDomains, StyleIdToInnerVoiceId}, @@ -153,6 +156,42 @@ impl Inner { ) }) }, + singing_teacher: |singing_teacher| { + singing_teacher + .as_ref() + .map(|manifest| { + let indices = EnumMap::from_fn(|k| &manifest[k]) + .try_map(|_, s| find_entry_index(s))?; + Ok(InferenceModelEntry { indices, manifest }) + }) + .transpose() + .map_err(move |source| { + error( + LoadModelErrorKind::ReadZipEntry { + filename: MANIFEST_FILENAME.to_owned(), + }, + source, + ) + }) + }, + frame_decode: |frame_decode| { + frame_decode + .as_ref() + .map(|manifest| { + let indices = EnumMap::from_fn(|k| &manifest[k]) + .try_map(|_, s| find_entry_index(s))?; + Ok(InferenceModelEntry { indices, manifest }) + }) + .transpose() + .map_err(move |source| { + error( + LoadModelErrorKind::ReadZipEntry { + filename: MANIFEST_FILENAME.to_owned(), + }, + source, + ) + }) + }, }) .collect() .map_err(crate::Error::from) @@ -225,36 +264,78 @@ impl Inner { }}; } - let InferenceDomainMap { talk } = - self.with_inference_model_entries(|inference_model_entries| { - inference_model_entries.each_ref().map(InferenceDomainMap { - talk: |talk| { - talk.as_ref() - .map(|InferenceModelEntry { indices, manifest }| { - ( - indices.map(|op, i| (i, manifest[op].clone())), - manifest.style_id_to_inner_voice_id.clone(), - ) - }) - }, - }) - }); + let InferenceDomainMap { + talk, + singing_teacher, + frame_decode, + } = self.with_inference_model_entries(|inference_model_entries| { + inference_model_entries.each_ref().map(InferenceDomainMap { + talk: |talk| { + talk.as_ref() + .map(|InferenceModelEntry { indices, manifest }| { + ( + indices.map(|op, i| (i, manifest[op].clone())), + manifest.style_id_to_inner_voice_id.clone(), + ) + }) + }, + singing_teacher: |singing_teacher| { + singing_teacher + .as_ref() + .map(|InferenceModelEntry { indices, manifest }| { + ( + indices.map(|op, i| (i, 
manifest[op].clone())), + manifest.style_id_to_inner_voice_id.clone(), + ) + }) + }, + frame_decode: |frame_decode| { + frame_decode + .as_ref() + .map(|InferenceModelEntry { indices, manifest }| { + ( + indices.map(|op, i| (i, manifest[op].clone())), + manifest.style_id_to_inner_voice_id.clone(), + ) + }) + }, + }) + }); + + let talk = OptionFuture::from(talk.map(|(entries, style_id_to_inner_voice_id)| async { + let [predict_duration, predict_intonation, predict_spectrogram, run_vocoder] = + entries.into_array(); + + let predict_duration = read_file!(predict_duration); + let predict_intonation = read_file!(predict_intonation); + let predict_spectrogram = read_file!(predict_spectrogram); + let run_vocoder = read_file!(run_vocoder); + + let model_bytes = EnumMap::from_array([ + predict_duration, + predict_intonation, + predict_spectrogram, + run_vocoder, + ]); + + Ok((style_id_to_inner_voice_id, model_bytes)) + })) + .await + .transpose()?; - let talk = OptionFuture::from(talk.map( - |(entries, style_id_to_inner_voice_id)| async move { - let [predict_duration, predict_intonation, predict_spectrogram, run_vocoder] = + let singing_teacher = OptionFuture::from(singing_teacher.map( + |(entries, style_id_to_inner_voice_id)| async { + let [predict_sing_consonant_length, predict_sing_f0, predict_sing_volume] = entries.into_array(); - let predict_duration = read_file!(predict_duration); - let predict_intonation = read_file!(predict_intonation); - let predict_spectrogram = read_file!(predict_spectrogram); - let run_vocoder = read_file!(run_vocoder); + let predict_sing_consonant_length = read_file!(predict_sing_consonant_length); + let predict_sing_f0 = read_file!(predict_sing_f0); + let predict_sing_volume = read_file!(predict_sing_volume); let model_bytes = EnumMap::from_array([ - predict_duration, - predict_intonation, - predict_spectrogram, - run_vocoder, + predict_sing_consonant_length, + predict_sing_f0, + predict_sing_volume, ]); Ok((style_id_to_inner_voice_id, 
model_bytes)) @@ -263,7 +344,25 @@ impl Inner { .await .transpose()?; - Ok(InferenceDomainMap { talk }) + let frame_decode = OptionFuture::from(frame_decode.map( + |(entries, style_id_to_inner_voice_id)| async { + let [sf_decode] = entries.into_array(); + + let sf_decode = read_file!(sf_decode); + + let model_bytes = EnumMap::from_array([sf_decode]); + + Ok((style_id_to_inner_voice_id, model_bytes)) + }, + )) + .await + .transpose()?; + + Ok(InferenceDomainMap { + talk, + singing_teacher, + frame_decode, + }) } } @@ -396,9 +495,15 @@ impl InferenceDomainMap { /// /// 例えば`self.talk`が`None`のとき、`StyleType::Talk`に対して`false`を返す。 fn accepts(&self, style_type: StyleType) -> bool { - let Self { talk } = self; + let Self { + talk, + singing_teacher, + frame_decode, + } = self; - return TalkDomain::contains(style_type).implies(|| talk.is_some()); + return TalkDomain::contains(style_type).implies(|| talk.is_some()) + && SingingTeacherDomain::contains(style_type).implies(|| singing_teacher.is_some()) + && FrameDecodeDomain::contains(style_type).implies(|| frame_decode.is_some()); #[ext] impl D { @@ -503,7 +608,7 @@ mod tests { use crate::{ infer::domains::InferenceDomainMap, - manifest::{ManifestDomains, TalkManifest}, + manifest::{FrameDecodeManifest, ManifestDomains, SingingTeacherManifest, TalkManifest}, SpeakerMeta, StyleType, }; @@ -511,6 +616,8 @@ mod tests { #[case( &InferenceDomainMap { talk: None, + singing_teacher: None, + frame_decode: None, }, &[], Ok(()) @@ -518,6 +625,8 @@ mod tests { #[case( &InferenceDomainMap { talk: Some(TalkManifest::default()), + singing_teacher: Some(SingingTeacherManifest::default()), + frame_decode: Some(FrameDecodeManifest::default()), }, &[speaker(&[StyleType::Talk])], Ok(()) @@ -525,6 +634,8 @@ mod tests { #[case( &InferenceDomainMap { talk: Some(TalkManifest::default()), + singing_teacher: Some(SingingTeacherManifest::default()), + frame_decode: Some(FrameDecodeManifest::default()), }, &[speaker(&[StyleType::Talk, StyleType::Sing])], 
Ok(()) @@ -532,6 +643,8 @@ mod tests { #[case( &InferenceDomainMap { talk: None, + singing_teacher: None, + frame_decode: None, }, &[speaker(&[StyleType::Talk])], Err(()) diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index 78805369e..ab191164c 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -468,6 +468,214 @@ pub unsafe extern "C" fn render_audio_segment( } } +/// # Safety +/// +/// - `consonant` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `vowel` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `note_duration` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`. +/// - `output` must be interpretable as a Rust `&mut [MaybeUninit; length as usize]` of `i64` elements. +#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists in the libraries that make up voicevox_core_c_api +pub unsafe extern "C" fn predict_sing_consonant_length_forward( + length: i64, + consonant: *mut i64, + vowel: *mut i64, + note_duration: *mut i64, + speaker_id: *mut i64, + output: *mut i64, +) -> bool { + init_logger_once(); + assert_aligned(consonant); + assert_aligned(vowel); + assert_aligned(note_duration); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).predict_sing_consonant_length( + // SAFETY: The safety contract must be upheld by the caller. 
+ unsafe { ndarray::ArrayView::from_shape_ptr([length], consonant) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], vowel) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], note_duration) }.to_owned(), + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = length; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +/// # Safety +/// +/// - `phoneme` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `note` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`. +/// - `output` must be interpretable as a Rust `&mut [MaybeUninit; length as usize]` of `f32` elements. +#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists in the libraries that make up voicevox_core_c_api +pub unsafe extern "C" fn predict_sing_f0_forward( + length: i64, + phoneme: *mut i64, + note: *mut i64, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(phoneme); + assert_aligned(note); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).predict_sing_f0( + // SAFETY: The safety contract must be upheld by the caller. 
+ unsafe { ndarray::ArrayView::from_shape_ptr([length], phoneme) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], note) }.to_owned(), + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = length; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +/// # Safety +/// +/// - `phoneme` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `note` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `f0` must be interpretable as a Rust `&[f32; length as usize]`. +/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`. +/// - `output` must be interpretable as a Rust `&mut [MaybeUninit; length as usize]` of `f32` elements. +#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists in the libraries that make up voicevox_core_c_api +pub unsafe extern "C" fn predict_sing_volume_forward( + length: i64, + phoneme: *mut i64, + note: *mut i64, + f0: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(phoneme); + assert_aligned(note); + assert_aligned(f0); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).predict_sing_volume( + // SAFETY: The safety contract must be upheld by the caller. 
+ unsafe { ndarray::ArrayView::from_shape_ptr([length], phoneme) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], note) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], f0) }.to_owned(), + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = length; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +/// # Safety +/// +/// - `phoneme` must be interpretable as a Rust `&[i64; length as usize]`. +/// - `f0` must be interpretable as a Rust `&[f32; length as usize]`. +/// - `volume` must be interpretable as a Rust `&[f32; length as usize]`. +/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`. +/// - `output` must be interpretable as a Rust `&mut [MaybeUninit; length as usize * 256]` of `f32` elements; the body copies `length * 256` values into it, not `length`. +#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists in the libraries that make up voicevox_core_c_api +pub unsafe extern "C" fn sf_decode_forward( + length: i64, + phoneme: *mut i64, + f0: *mut f32, + volume: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(phoneme); + assert_aligned(f0); + assert_aligned(volume); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).sf_decode( + // SAFETY: The safety contract must be upheld by the caller. 
+ unsafe { ndarray::ArrayView::from_shape_ptr([length], phoneme) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], f0) }.to_owned(), + unsafe { ndarray::ArrayView::from_shape_ptr([length], volume) }.to_owned(), + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = length * 256; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + #[track_caller] fn assert_aligned(ptr: *mut impl Sized) { assert!( diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index 26a60d033..472784ac0 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -39,6 +39,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes GetSupportedDevices => VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR, StyleNotFound => VOICEVOX_RESULT_STYLE_NOT_FOUND_ERROR, ModelNotFound => VOICEVOX_RESULT_MODEL_NOT_FOUND_ERROR, + UnsupportedModel => VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR, // FIXME: dead code RunModel => VOICEVOX_RESULT_RUN_MODEL_ERROR, ExtractFullContextLabel => VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR, ParseKana => VOICEVOX_RESULT_PARSE_KANA_ERROR, diff --git a/crates/voicevox_core_c_api/src/result_code.rs b/crates/voicevox_core_c_api/src/result_code.rs index 645eb289b..cb6ac0ede 100644 --- a/crates/voicevox_core_c_api/src/result_code.rs +++ b/crates/voicevox_core_c_api/src/result_code.rs @@ -22,6 +22,8 @@ pub enum VoicevoxResultCode { VOICEVOX_RESULT_STYLE_NOT_FOUND_ERROR = 6, /// 音声モデルIDに対する音声モデルが見つからなかった VOICEVOX_RESULT_MODEL_NOT_FOUND_ERROR = 7, + /// 
対応していないmodelが指定された + VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR = !0, // FIXME: dead code /// 推論に失敗した VOICEVOX_RESULT_RUN_MODEL_ERROR = 8, /// コンテキストラベル出力に失敗した @@ -80,6 +82,7 @@ pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> c"指定されたIDに対する音声モデルが見つかりませんでした。読み込まれていないか、読み込み\ が既に解除されています" } + VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR => c"未対応なモデルです", // FIXME: dead code VOICEVOX_RESULT_RUN_MODEL_ERROR => c"推論に失敗しました", VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => { c"入力テキストからのフルコンテキストラベル抽出に失敗しました" diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 218ec2a7b..c45292fad 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -48,6 +48,26 @@ metas = ''' "version": "0.0.1", "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", "order": null + }, + { + "name": "dummy4", + "styles": [ + { + "id": 3000, + "name": "style4-1", + "type": "frame_decode", + "order": null + }, + { + "id": 6000, + "name": "style4-2", + "type": "singing_teacher", + "order": null + } + ], + "version": "0.0.1", + "speaker_uuid": "32478dc2-4c8b-44f7-b041-c836e0df6d56", + "order": null } ]''' stderr.windows = ''' @@ -174,6 +194,26 @@ metas = ''' "version": "0.0.1", "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", "order": null + }, + { + "name": "dummy4", + "styles": [ + { + "id": 3000, + "name": "style4-1", + "type": "frame_decode", + "order": null + }, + { + "id": 6000, + "name": "style4-2", + "type": "singing_teacher", + "order": null + } + ], + "version": "0.0.1", + "speaker_uuid": "32478dc2-4c8b-44f7-b041-c836e0df6d56", + "order": null } ]''' stderr.windows = ''' diff --git a/crates/voicevox_core_java_api/src/common.rs b/crates/voicevox_core_java_api/src/common.rs index cb2a89a7f..ceb4da3d5 100644 --- a/crates/voicevox_core_java_api/src/common.rs +++ b/crates/voicevox_core_java_api/src/common.rs @@ -84,6 +84,7 @@ 
where GetSupportedDevices, StyleNotFound, ModelNotFound, + UnsupportedModel, // FIXME: dead code RunModel, ExtractFullContextLabel, ParseKana, diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_models.py b/crates/voicevox_core_python_api/python/voicevox_core/_models.py index dc32558fb..68359bf18 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_models.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/_models.py @@ -41,6 +41,15 @@ class StyleType(str, Enum): TALK = "talk" """音声合成クエリの作成と音声合成が可能。""" + SINGING_TEACHER = "singing_teacher" + """歌唱音声合成用のクエリの作成が可能。""" + + FRAME_DECODE = "frame_decode" + """歌唱音声合成が可能。""" + + SING = "sing" + """歌唱音声合成用のクエリの作成と歌唱音声合成が可能。""" + @pydantic.dataclasses.dataclass class StyleMeta: diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 711da5fe4..75026e5e0 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -233,6 +233,7 @@ pub(crate) impl voicevox_core::Result { ErrorKind::GetSupportedDevices => GetSupportedDevicesError::new_err(msg), ErrorKind::StyleNotFound => StyleNotFoundError::new_err(msg), ErrorKind::ModelNotFound => ModelNotFoundError::new_err(msg), + ErrorKind::UnsupportedModel => unreachable!(), // FIXME: dead code ErrorKind::RunModel => RunModelError::new_err(msg), ErrorKind::ExtractFullContextLabel => ExtractFullContextLabelError::new_err(msg), ErrorKind::ParseKana => ParseKanaError::new_err(msg), diff --git a/model/sample.vvm/manifest.json b/model/sample.vvm/manifest.json index 1075a0797..6161d204a 100644 --- a/model/sample.vvm/manifest.json +++ b/model/sample.vvm/manifest.json @@ -11,5 +11,19 @@ "302": 2, "303": 3 } + }, + "singing_teacher": { + "predict_sing_consonant_length_filename": "predict_sing_consonant_length.onnx", + "predict_sing_f0_filename": "predict_sing_f0.onnx", + "predict_sing_volume_filename": "predict_sing_volume.onnx", + 
"style_id_to_inner_voice_id": { + "6000": 0 + } + }, + "frame_decode": { + "sf_decode_filename": "sf_decode.onnx", + "style_id_to_inner_voice_id": { + "3000": 0 + } } } diff --git a/model/sample.vvm/metas.json b/model/sample.vvm/metas.json index 08f273fd7..e27a015dd 100644 --- a/model/sample.vvm/metas.json +++ b/model/sample.vvm/metas.json @@ -35,5 +35,22 @@ ], "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", "version": "0.0.1" + }, + { + "name": "dummy4", + "styles": [ + { + "name": "style4-1", + "id": 3000, + "type": "frame_decode" + }, + { + "name": "style4-2", + "id": 6000, + "type": "singing_teacher" + } + ], + "speaker_uuid": "32478dc2-4c8b-44f7-b041-c836e0df6d56", + "version": "0.0.1" } ] diff --git a/model/sample.vvm/predict_sing_consonant_length.onnx b/model/sample.vvm/predict_sing_consonant_length.onnx new file mode 100644 index 000000000..88a85df7a Binary files /dev/null and b/model/sample.vvm/predict_sing_consonant_length.onnx differ diff --git a/model/sample.vvm/predict_sing_f0.onnx b/model/sample.vvm/predict_sing_f0.onnx new file mode 100644 index 000000000..026c3fb1f Binary files /dev/null and b/model/sample.vvm/predict_sing_f0.onnx differ diff --git a/model/sample.vvm/predict_sing_volume.onnx b/model/sample.vvm/predict_sing_volume.onnx new file mode 100644 index 000000000..d80f97cba Binary files /dev/null and b/model/sample.vvm/predict_sing_volume.onnx differ diff --git a/model/sample.vvm/sf_decode.onnx b/model/sample.vvm/sf_decode.onnx new file mode 100644 index 000000000..169285cb4 Binary files /dev/null and b/model/sample.vvm/sf_decode.onnx differ