Merge branch 'main' into refactor-make-text-analyzer-pub

VOICEVOX · Jan 11, 2025 · 24449be · 24449be
2 parents a9cb54d + fda1e71
commit 24449be
Show file tree

Hide file tree

Showing 27 changed files with 612 additions and 145 deletions.
diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml
@@ -225,7 +225,9 @@ jobs:
       - name: set cargo version
         run: |
           cargo set-version "$VERSION" --exclude voicevox_core_python_api --exclude downloader --exclude xtask
-          if ${{ matrix.python_whl }}; then cargo set-version "$VERSION" -p voicevox_core_python_api; fi
+          if ${{ matrix.python_whl }}; then
+            sed -i_ 's/version = "\(0\.0\.0\)"/version = "'"$VERSION"'"/' ./crates/voicevox_core_python_api/pyproject.toml
+          fi
       - name: cache target
         uses: Swatinem/rust-cache@v2
         if: ${{ !inputs.is_production }}

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
@@ -31,7 +31,7 @@ jobs:
           poetry config virtualenvs.create false
       - name: Validate poetry.lock
         run: |
-          poetry lock --no-update
+          poetry lock
           git diff --exit-code
       - name: Install dependencies
         run: poetry install --with test

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -94,12 +94,13 @@ typetag = "0.2.18"
 url = "2.5.4"
 uuid = "1.10.0"
 voicevox_core = { path = "crates/voicevox_core" }
+voicevox_core_macros = { path = "crates/voicevox_core_macros" }
 windows = "0.43.0"
 zip = "0.6.3"
 
 [workspace.dependencies.voicevox-ort]
 git = "https://github.com/VOICEVOX/ort.git"
-rev = "09a9fe1619c1561efafc02f68f0bda4aad879771"
+rev = "cecd844162a1c6188de03b4566c81d9d38a28600"
 
 [workspace.dependencies.open_jtalk]
 git = "https://github.com/VOICEVOX/open_jtalk-rs.git"
@@ -114,7 +115,7 @@ rev = "de226a26e8e18edbdb1d6f986afe37bbbf35fbf4"
 version = "0.0.0"
 edition = "2021"
 publish = false
-rust-version = "1.81.0"
+rust-version = "1.84.0"
 license = "MIT"
 
 # min-sized-rustを元にrelease buildのサイズが小さくなるようにした

diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml
@@ -52,7 +52,7 @@ thiserror.workspace = true
 tracing.workspace = true
 uuid = { workspace = true, features = ["v4", "serde"] }
 voicevox-ort = { workspace = true, features = ["download-binaries", "__init-for-voicevox"] }
-voicevox_core_macros = { path = "../voicevox_core_macros" }
+voicevox_core_macros.workspace = true
 
 [dev-dependencies]
 heck.workspace = true

diff --git a/crates/voicevox_core/src/infer/domains.rs b/crates/voicevox_core/src/infer/domains.rs
@@ -1,70 +1,87 @@
+pub(crate) mod experimental_talk;
 mod frame_decode;
 mod singing_teacher;
-mod talk;
+pub(crate) mod talk;
 
 use educe::Educe;
 use serde::{Deserialize, Deserializer};
 
 pub(crate) use self::{
+    experimental_talk::{
+        ExperimentalTalkDomain, ExperimentalTalkOperation, GenerateFullIntermediateInput,
+        GenerateFullIntermediateOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
+    },
     frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput},
     singing_teacher::{
         PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input,
         PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain,
         SingingTeacherOperation,
     },
-    talk::{
-        GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
-        PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
-        RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
-    },
+    talk::{DecodeInput, DecodeOutput, TalkDomain, TalkOperation},
 };
 
 #[derive(Educe)]
 // TODO: `bounds`に`V: ?Sized`も入れようとすると、よくわからない理由で弾かれる。最新版のeduce
 // でもそうなのか？また最新版でも駄目だとしたら、弾いている理由は何なのか？
 #[educe(Clone(
-    bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
+    bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::ExperimentalTalk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
 ))]
 pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> {
     pub(crate) talk: V::Talk,
+    pub(crate) experimental_talk: V::ExperimentalTalk,
     pub(crate) singing_teacher: V::SingingTeacher,
     pub(crate) frame_decode: V::FrameDecode,
 }
 
-impl<T, S, F> InferenceDomainMap<(T, S, F)> {
-    pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> {
+impl<T, X, S, F> InferenceDomainMap<(T, X, S, F)> {
+    pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &X, &S, &F)> {
         let talk = &self.talk;
+        let experimental_talk = &self.experimental_talk;
         let singing_teacher = &self.singing_teacher;
         let frame_decode = &self.frame_decode;
         InferenceDomainMap {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         }
     }
 
-    pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>(
+    pub(crate) fn map<
+        T2,
+        X2,
+        S2,
+        F2,
+        Ft: FnOnce(T) -> T2,
+        Fx: FnOnce(X) -> X2,
+        Fs: FnOnce(S) -> S2,
+        Ff: FnOnce(F) -> F2,
+    >(
         self,
-        fs: InferenceDomainMap<(Ft, Fs, Ff)>,
-    ) -> InferenceDomainMap<(T2, S2, F2)> {
+        fs: InferenceDomainMap<(Ft, Fx, Fs, Ff)>,
+    ) -> InferenceDomainMap<(T2, X2, S2, F2)> {
         let talk = (fs.talk)(self.talk);
+        let experimental_talk = (fs.experimental_talk)(self.experimental_talk);
         let singing_teacher = (fs.singing_teacher)(self.singing_teacher);
         let frame_decode = (fs.frame_decode)(self.frame_decode);
         InferenceDomainMap {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         }
     }
 }
 
-impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> {
-    pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> {
+impl<T, X, S, F, E> InferenceDomainMap<(Result<T, E>, Result<X, E>, Result<S, E>, Result<F, E>)> {
+    pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, X, S, F)>, E> {
         let talk = self.talk?;
+        let experimental_talk = self.experimental_talk?;
         let singing_teacher = self.singing_teacher?;
         let frame_decode = self.frame_decode?;
         Ok(InferenceDomainMap {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         })
@@ -74,6 +91,7 @@ impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)>
 impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V>
 where
     V::Talk: Deserialize<'de>,
+    V::ExperimentalTalk: Deserialize<'de>,
     V::SingingTeacher: Deserialize<'de>,
     V::FrameDecode: Deserialize<'de>,
 {
@@ -83,18 +101,21 @@ where
     {
         let Repr {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         } = Repr::deserialize(deserializer)?;
         return Ok(Self {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         });
 
         #[derive(Deserialize)]
-        struct Repr<T, S, F> {
+        struct Repr<T, E, S, F> {
             talk: T,
+            experimental_talk: E,
             singing_teacher: S,
             frame_decode: F,
         }
@@ -103,12 +124,14 @@ where
 
 pub(crate) trait InferenceDomainMapValues {
     type Talk;
+    type ExperimentalTalk;
     type SingingTeacher;
     type FrameDecode;
 }
 
-impl<T, S, F> InferenceDomainMapValues for (T, S, F) {
+impl<T, X, S, F> InferenceDomainMapValues for (T, X, S, F) {
     type Talk = T;
+    type ExperimentalTalk = X;
     type SingingTeacher = S;
     type FrameDecode = F;
 }
@@ -120,6 +143,10 @@ macro_rules! inference_domain_map_values {
                 $body
                 where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
             ),
+            ::macros::substitute_type!(
+                $body
+                where $arg = crate::infer::domains::ExperimentalTalkDomain as crate::infer::InferenceDomain
+            ),
             ::macros::substitute_type!(
                 $body
                 where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain

diff --git a/crates/voicevox_core/src/infer/domains/experimental_talk.rs b/crates/voicevox_core/src/infer/domains/experimental_talk.rs
@@ -0,0 +1,116 @@
+use std::{collections::BTreeSet, sync::LazyLock};
+
+use enum_map::Enum;
+use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
+use ndarray::{Array0, Array1, Array2};
+
+use crate::{manifest::ExperimentalTalkManifest, StyleType};
+
+use super::super::{
+    InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
+};
+
+pub(crate) enum ExperimentalTalkDomain {}
+
+impl InferenceDomain for ExperimentalTalkDomain {
+    type Operation = ExperimentalTalkOperation;
+    type Manifest = ExperimentalTalkManifest;
+
+    fn style_types() -> &'static BTreeSet<StyleType> {
+        static STYLE_TYPES: LazyLock<BTreeSet<StyleType>> =
+            LazyLock::new(|| [StyleType::Talk].into());
+        &STYLE_TYPES
+    }
+}
+
+#[derive(Clone, Copy, Enum, InferenceOperation)]
+#[inference_operation(
+    type Domain = ExperimentalTalkDomain;
+)]
+pub(crate) enum ExperimentalTalkOperation {
+    #[inference_operation(
+        type Input = PredictDurationInput;
+        type Output = PredictDurationOutput;
+    )]
+    PredictDuration,
+
+    #[inference_operation(
+        type Input = PredictIntonationInput;
+        type Output = PredictIntonationOutput;
+    )]
+    PredictIntonation,
+
+    #[inference_operation(
+        type Input = GenerateFullIntermediateInput;
+        type Output = GenerateFullIntermediateOutput;
+    )]
+    GenerateFullIntermediate,
+
+    #[inference_operation(
+        type Input = RenderAudioSegmentInput;
+        type Output = RenderAudioSegmentOutput;
+    )]
+    RenderAudioSegment,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictDuration;
+)]
+pub(crate) struct PredictDurationInput {
+    pub(crate) phoneme_list: Array1<i64>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictDurationOutput {
+    pub(crate) phoneme_length: Array1<f32>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictIntonation;
+)]
+pub(crate) struct PredictIntonationInput {
+    pub(crate) length: Array0<i64>,
+    pub(crate) vowel_phoneme_list: Array1<i64>,
+    pub(crate) consonant_phoneme_list: Array1<i64>,
+    pub(crate) start_accent_list: Array1<i64>,
+    pub(crate) end_accent_list: Array1<i64>,
+    pub(crate) start_accent_phrase_list: Array1<i64>,
+    pub(crate) end_accent_phrase_list: Array1<i64>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictIntonationOutput {
+    pub(crate) f0_list: Array1<f32>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = GenerateFullIntermediate;
+)]
+pub(crate) struct GenerateFullIntermediateInput {
+    pub(crate) f0: Array2<f32>,
+    pub(crate) phoneme: Array2<f32>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct GenerateFullIntermediateOutput {
+    pub(crate) spec: Array2<f32>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = RenderAudioSegment;
+)]
+pub(crate) struct RenderAudioSegmentInput {
+    pub(crate) spec: Array2<f32>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct RenderAudioSegmentOutput {
+    pub(crate) wave: Array1<f32>,
+}