Make the talk models contained in a VVM optional
qryxip committed Mar 7, 2024
1 parent 0848630 commit ca6ce4a
Showing 6 changed files with 63 additions and 33 deletions.
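
In short: `VoiceModel::read_inference_models` now returns `LoadModelResult<Option<EnumMap<…>>>` instead of `LoadModelResult<EnumMap<…>>`, so a VVM whose manifest lists no talk models can still be opened and loaded. The following is a standalone sketch of that shape using stand-in types (not the crate's actual API):

```rust
// Standalone sketch of the new return shape (stand-in types; the real code
// uses LoadModelResult, EnumMap, and a VVM entry reader).
struct TalkModelFilenames {
    predict_duration: String,
    predict_intonation: String,
    decode: String,
}

struct Manifest {
    // Previously three mandatory `*_filename` fields; now one optional group.
    talk_model_filenames: Option<TalkModelFilenames>,
}

// Stand-in for reading one entry out of the VVM archive.
fn read_entry(filename: &str) -> Result<Vec<u8>, String> {
    Ok(filename.as_bytes().to_vec())
}

// `Err(_)` still means an I/O or parse failure; `Ok(None)` now means
// "this VVM simply has no talk models", which callers treat as a no-op.
fn read_inference_models(manifest: &Manifest) -> Result<Option<[Vec<u8>; 3]>, String> {
    manifest
        .talk_model_filenames
        .as_ref()
        .map(|f| {
            Ok([
                read_entry(&f.predict_duration)?,
                read_entry(&f.predict_intonation)?,
                read_entry(&f.decode)?,
            ])
        })
        .transpose() // Option<Result<T, E>> -> Result<Option<T>, E>
}

fn main() {
    let manifest = Manifest { talk_model_filenames: None };
    assert!(read_inference_models(&manifest).unwrap().is_none());
}
```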
4 changes: 2 additions & 2 deletions crates/voicevox_core/src/infer/status.rs
@@ -400,7 +400,7 @@ mod tests {
             enum_map!(_ => InferenceSessionOptions::new(0, false)),
         );
         let model = &open_default_vvm_file().await;
-        let model_bytes = &model.read_inference_models().await.unwrap();
+        let model_bytes = &model.read_inference_models().await.unwrap().unwrap();
         let result = status.insert_model(model.header(), model_bytes);
         assert_debug_fmt_eq!(Ok(()), result);
         assert_eq!(1, status.loaded_models.lock().unwrap().0.len());
@@ -414,7 +414,7 @@ mod tests {
         );
         let vvm = open_default_vvm_file().await;
         let model_header = vvm.header();
-        let model_bytes = &vvm.read_inference_models().await.unwrap();
+        let model_bytes = &vvm.read_inference_models().await.unwrap().unwrap();
         assert!(
             !status.is_loaded_model(&model_header.id),
             "model should not be loaded"
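
The double `unwrap` in these tests reflects the new nesting: the first peels the `LoadModelResult`, the second the `Option` (the default sample VVM does contain talk models, so `Some` is expected there). A tiny illustration with hypothetical types:

```rust
// Hypothetical illustration of the Result<Option<_>> nesting in the tests.
fn read(has_talk_models: bool) -> Result<Option<Vec<u8>>, ()> {
    Ok(has_talk_models.then(|| vec![0x01, 0x02]))
}

fn main() {
    // First `unwrap` peels the Result, the second peels the Option.
    let bytes = read(true).unwrap().unwrap();
    assert_eq!(bytes, [0x01, 0x02]);
    assert!(read(false).unwrap().is_none());
}
```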
11 changes: 8 additions & 3 deletions crates/voicevox_core/src/manifest.rs
@@ -41,9 +41,14 @@ pub struct Manifest {
     #[allow(dead_code)]
     manifest_version: ManifestVersion,
     metas_filename: String,
-    decode_filename: String,
-    predict_duration_filename: String,
-    predict_intonation_filename: String,
+    talk_model_filenames: Option<TalkModelFilenames>,
     #[serde(default)]
     style_id_to_model_inner_id: BTreeMap<StyleId, ModelInnerId>,
 }
+
+#[derive(Deserialize, Clone)]
+pub(crate) struct TalkModelFilenames {
+    pub(crate) predict_duration: String,
+    pub(crate) predict_intonation: String,
+    pub(crate) decode: String,
+}
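
Because the new field is an `Option`, serde treats a missing `talk_model_filenames` key as `None` with no extra attribute needed, so existing manifests keep parsing. A sketch of that behavior, assuming `serde`/`serde_json` and a trimmed-down field set (the real `Manifest` has more fields and newtype wrappers):

```rust
use serde::Deserialize;

#[derive(Deserialize)]
struct TalkModelFilenames {
    predict_duration: String,
    predict_intonation: String,
    decode: String,
}

#[derive(Deserialize)]
struct Manifest {
    metas_filename: String,
    // `Option` fields deserialize to `None` when the key is absent,
    // so pre-existing manifests without talk models still parse.
    talk_model_filenames: Option<TalkModelFilenames>,
}

fn main() -> serde_json::Result<()> {
    let with_talk: Manifest = serde_json::from_str(
        r#"{
            "metas_filename": "metas.json",
            "talk_model_filenames": {
                "predict_duration": "predict_duration.onnx",
                "predict_intonation": "predict_intonation.onnx",
                "decode": "decode.onnx"
            }
        }"#,
    )?;
    let without_talk: Manifest = serde_json::from_str(r#"{"metas_filename": "metas.json"}"#)?;

    assert_eq!(
        with_talk.talk_model_filenames.as_ref().map(|f| f.decode.as_str()),
        Some("decode.onnx"),
    );
    assert!(without_talk.talk_model_filenames.is_none());
    Ok(())
}
```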
12 changes: 8 additions & 4 deletions crates/voicevox_core/src/synthesizer.rs
@@ -201,8 +201,10 @@ pub(crate) mod blocking {

         /// Loads a voice model.
         pub fn load_voice_model(&self, model: &crate::blocking::VoiceModel) -> Result<()> {
-            let model_bytes = &model.read_inference_models()?;
-            self.status.insert_model(model.header(), model_bytes)
+            if let Some(model_bytes) = model.read_inference_models()? {
+                self.status.insert_model(model.header(), &model_bytes)?;
+            }
+            Ok(())
         }
 
         /// Unloads a voice model.
@@ -1157,8 +1159,10 @@ pub(crate) mod tokio {
         }
 
         pub async fn load_voice_model(&self, model: &crate::tokio::VoiceModel) -> Result<()> {
-            let model_bytes = &model.read_inference_models().await?;
-            self.0.status.insert_model(model.header(), model_bytes)
+            if let Some(model_bytes) = model.read_inference_models().await? {
+                self.0.status.insert_model(model.header(), &model_bytes)?;
+            }
+            Ok(())
         }
 
         pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> {
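
Behaviorally, `load_voice_model` changes from always inserting to inserting only when talk models are present: a VVM without them now loads as a successful no-op instead of failing at read time. A condensed sketch of the control flow with stand-in types:

```rust
// Condensed control flow of the new load_voice_model (stand-in types).
fn insert_model(_bytes: &[u8]) -> Result<(), String> {
    Ok(())
}

fn load_voice_model(model_bytes: Option<Vec<u8>>) -> Result<(), String> {
    // Before: the read itself failed on a VVM without talk models.
    // After: absence is not an error; loading simply does nothing.
    if let Some(bytes) = model_bytes {
        insert_model(&bytes)?;
    }
    Ok(())
}

fn main() {
    assert!(load_voice_model(Some(vec![1, 2, 3])).is_ok());
    assert!(load_voice_model(None).is_ok()); // no talk models: still Ok
}
```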
8 changes: 5 additions & 3 deletions manifest.json
@@ -1,9 +1,11 @@
 {
   "manifest_version": "0.0.0",
   "metas_filename": "metas.json",
-  "decode_filename": "decode.onnx",
-  "predict_duration_filename": "predict_duration.onnx",
-  "predict_intonation_filename": "predict_intonation.onnx",
+  "talk_model_filenames": {
+    "predict_duration": "predict_duration.onnx",
+    "predict_intonation": "predict_intonation.onnx",
+    "decode": "decode.onnx"
+  },
   "style_id_to_model_inner_id": {
     "302": 2,
     "303": 3
61 changes: 40 additions & 21 deletions crates/voicevox_core/src/voice_model.rs
@@ -82,7 +82,7 @@ pub(crate) mod blocking {
     use crate::{
         error::{LoadModelError, LoadModelErrorKind, LoadModelResult},
         infer::domain::InferenceOperationImpl,
-        manifest::Manifest,
+        manifest::{Manifest, TalkModelFilenames},
         VoiceModelMeta,
     };

@@ -99,21 +99,30 @@ pub(crate) mod blocking {
     impl self::VoiceModel {
         pub(crate) fn read_inference_models(
             &self,
-        ) -> LoadModelResult<EnumMap<InferenceOperationImpl, Vec<u8>>> {
+        ) -> LoadModelResult<Option<EnumMap<InferenceOperationImpl, Vec<u8>>>> {
             let reader = BlockingVvmEntryReader::open(&self.header.path)?;
 
-            let model_bytes = [
-                self.header.manifest.predict_duration_filename(),
-                self.header.manifest.predict_intonation_filename(),
-                self.header.manifest.decode_filename(),
-            ]
-            .into_par_iter()
-            .map(|filename| reader.read_vvm_entry(filename))
-            .collect::<std::result::Result<Vec<_>, _>>()?
-            .try_into()
-            .unwrap_or_else(|_| panic!("should be same length"));
-
-            Ok(EnumMap::from_array(model_bytes))
+            self.header
+                .manifest
+                .talk_model_filenames()
+                .as_ref()
+                .map(
+                    |TalkModelFilenames {
+                         predict_duration,
+                         predict_intonation,
+                         decode,
+                     }| {
+                        let model_bytes = [predict_duration, predict_intonation, decode]
+                            .into_par_iter()
+                            .map(|filename| reader.read_vvm_entry(filename))
+                            .collect::<std::result::Result<Vec<_>, _>>()?
+                            .try_into()
+                            .unwrap_or_else(|_| panic!("should be same length"));
+
+                        Ok(EnumMap::from_array(model_bytes))
+                    },
+                )
+                .transpose()
         }

         /// Constructs a `VoiceModel` from a VVM file.
@@ -211,7 +220,7 @@ pub(crate) mod tokio {
     use crate::{
         error::{LoadModelError, LoadModelErrorKind, LoadModelResult},
         infer::domain::InferenceOperationImpl,
-        manifest::Manifest,
+        manifest::{Manifest, TalkModelFilenames},
         Result, VoiceModelMeta,
     };

@@ -228,24 +237,34 @@ pub(crate) mod tokio {
     impl self::VoiceModel {
         pub(crate) async fn read_inference_models(
             &self,
-        ) -> LoadModelResult<EnumMap<InferenceOperationImpl, Vec<u8>>> {
+        ) -> LoadModelResult<Option<EnumMap<InferenceOperationImpl, Vec<u8>>>> {
             let reader = AsyncVvmEntryReader::open(&self.header.path).await?;
 
+            let Some(TalkModelFilenames {
+                predict_duration,
+                predict_intonation,
+                decode,
+            }) = self.header.manifest.talk_model_filenames()
+            else {
+                return Ok(None);
+            };
+
             let (
                 decode_model_result,
                 predict_duration_model_result,
                 predict_intonation_model_result,
             ) = join3(
-                reader.read_vvm_entry(self.header.manifest.decode_filename()),
-                reader.read_vvm_entry(self.header.manifest.predict_duration_filename()),
-                reader.read_vvm_entry(self.header.manifest.predict_intonation_filename()),
+                reader.read_vvm_entry(decode),
+                reader.read_vvm_entry(predict_duration),
+                reader.read_vvm_entry(predict_intonation),
             )
             .await;
 
-            Ok(EnumMap::from_array([
+            Ok(Some(EnumMap::from_array([
                 predict_duration_model_result?,
                 predict_intonation_model_result?,
                 decode_model_result?,
-            ]))
+            ])))
         }
         /// Constructs a `VoiceModel` from a VVM file.
         pub async fn from_path(path: impl AsRef<Path>) -> Result<Self> {
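
The two implementations express the same `Option`/`Result` plumbing in two styles: the blocking path maps over the `Option` and flips the nesting with `.transpose()`, while the tokio path uses a `let ... else` early return, which keeps the happy path flat when the body needs to `.await` (a closure would get in the way there). A sketch of both with sync stand-ins:

```rust
// The same Option/Result plumbing in both styles (sync stand-ins).
struct Filenames {
    decode: String,
}

fn read_entry(name: &str) -> Result<Vec<u8>, String> {
    Ok(name.as_bytes().to_vec())
}

// Blocking style: map over the Option, then flip the nesting.
fn read_via_transpose(f: Option<&Filenames>) -> Result<Option<Vec<u8>>, String> {
    f.map(|f| read_entry(&f.decode)).transpose()
}

// Tokio style: `let ... else` early return (stable since Rust 1.65).
fn read_via_let_else(f: Option<&Filenames>) -> Result<Option<Vec<u8>>, String> {
    let Some(Filenames { decode }) = f else {
        return Ok(None); // this VVM has no talk models
    };
    Ok(Some(read_entry(decode)?))
}

fn main() {
    let f = Filenames { decode: "decode.onnx".to_owned() };
    assert_eq!(
        read_via_transpose(Some(&f)).unwrap(),
        read_via_let_else(Some(&f)).unwrap(),
    );
    assert!(read_via_transpose(None).unwrap().is_none());
}
```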
Binary file modified model/sample.vvm
