Skip to content

Commit

Permalink
Merge branch 'main' into refactor-make-text-analyzer-pub
Browse files Browse the repository at this point in the history
  • Loading branch information
qryxip committed Jan 11, 2025
2 parents a9cb54d + fda1e71 commit 24449be
Show file tree
Hide file tree
Showing 27 changed files with 612 additions and 145 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/build_and_deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,9 @@ jobs:
- name: set cargo version
run: |
cargo set-version "$VERSION" --exclude voicevox_core_python_api --exclude downloader --exclude xtask
if ${{ matrix.python_whl }}; then cargo set-version "$VERSION" -p voicevox_core_python_api; fi
if ${{ matrix.python_whl }}; then
sed -i_ 's/version = "\(0\.0\.0\)"/version = "'"$VERSION"'"/' ./crates/voicevox_core_python_api/pyproject.toml
fi
- name: cache target
uses: Swatinem/rust-cache@v2
if: ${{ !inputs.is_production }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python_lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
poetry config virtualenvs.create false
- name: Validate poetry.lock
run: |
poetry lock --no-update
poetry lock
git diff --exit-code
- name: Install dependencies
run: poetry install --with test
Expand Down
8 changes: 6 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,13 @@ typetag = "0.2.18"
url = "2.5.4"
uuid = "1.10.0"
voicevox_core = { path = "crates/voicevox_core" }
voicevox_core_macros = { path = "crates/voicevox_core_macros" }
windows = "0.43.0"
zip = "0.6.3"

[workspace.dependencies.voicevox-ort]
git = "https://github.com/VOICEVOX/ort.git"
rev = "09a9fe1619c1561efafc02f68f0bda4aad879771"
rev = "cecd844162a1c6188de03b4566c81d9d38a28600"

[workspace.dependencies.open_jtalk]
git = "https://github.com/VOICEVOX/open_jtalk-rs.git"
Expand All @@ -114,7 +115,7 @@ rev = "de226a26e8e18edbdb1d6f986afe37bbbf35fbf4"
version = "0.0.0"
edition = "2021"
publish = false
rust-version = "1.81.0"
rust-version = "1.84.0"
license = "MIT"

# min-sized-rustを元にrelease buildのサイズが小さくなるようにした
Expand Down
2 changes: 1 addition & 1 deletion crates/voicevox_core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ thiserror.workspace = true
tracing.workspace = true
uuid = { workspace = true, features = ["v4", "serde"] }
voicevox-ort = { workspace = true, features = ["download-binaries", "__init-for-voicevox"] }
voicevox_core_macros = { path = "../voicevox_core_macros" }
voicevox_core_macros.workspace = true

[dev-dependencies]
heck.workspace = true
Expand Down
59 changes: 43 additions & 16 deletions crates/voicevox_core/src/infer/domains.rs
Original file line number Diff line number Diff line change
@@ -1,70 +1,87 @@
pub(crate) mod experimental_talk;
mod frame_decode;
mod singing_teacher;
mod talk;
pub(crate) mod talk;

use educe::Educe;
use serde::{Deserialize, Deserializer};

pub(crate) use self::{
experimental_talk::{
ExperimentalTalkDomain, ExperimentalTalkOperation, GenerateFullIntermediateInput,
GenerateFullIntermediateOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
},
frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput},
singing_teacher::{
PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input,
PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain,
SingingTeacherOperation,
},
talk::{
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
},
talk::{DecodeInput, DecodeOutput, TalkDomain, TalkOperation},
};

/// A record holding one value per inference domain slot (`talk`, `experimental_talk`,
/// `singing_teacher`, `frame_decode`). The type of each slot is taken from the
/// `InferenceDomainMapValues` implementation of `V`.
#[derive(Educe)]
// TODO: Trying to also put `V: ?Sized` into `bounds` gets rejected for a reason that is not
// understood. Is that still the case with the latest educe? And if the latest version also
// rejects it, what is the reason for the rejection?
#[educe(Clone(
bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::ExperimentalTalk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
))]
pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> {
pub(crate) talk: V::Talk,
pub(crate) experimental_talk: V::ExperimentalTalk,
pub(crate) singing_teacher: V::SingingTeacher,
pub(crate) frame_decode: V::FrameDecode,
}

// Helpers for maps whose per-domain types are given positionally by a tuple.
//
// `each_ref` borrows every slot of `self` and returns a new map holding those references;
// `self` is left untouched.
impl<T, S, F> InferenceDomainMap<(T, S, F)> {
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> {
impl<T, X, S, F> InferenceDomainMap<(T, X, S, F)> {
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &X, &S, &F)> {
let talk = &self.talk;
let experimental_talk = &self.experimental_talk;
let singing_teacher = &self.singing_teacher;
let frame_decode = &self.frame_decode;
InferenceDomainMap {
talk,
experimental_talk,
singing_teacher,
frame_decode,
}
}

// `map` consumes both `self` and `fs`. `fs` carries one `FnOnce` per slot, and each function
// is applied to the value stored in the slot of the same name. The calls are made in field
// order: talk, experimental_talk, singing_teacher, frame_decode.
pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>(
pub(crate) fn map<
T2,
X2,
S2,
F2,
Ft: FnOnce(T) -> T2,
Fx: FnOnce(X) -> X2,
Fs: FnOnce(S) -> S2,
Ff: FnOnce(F) -> F2,
>(
self,
fs: InferenceDomainMap<(Ft, Fs, Ff)>,
) -> InferenceDomainMap<(T2, S2, F2)> {
fs: InferenceDomainMap<(Ft, Fx, Fs, Ff)>,
) -> InferenceDomainMap<(T2, X2, S2, F2)> {
let talk = (fs.talk)(self.talk);
let experimental_talk = (fs.experimental_talk)(self.experimental_talk);
let singing_teacher = (fs.singing_teacher)(self.singing_teacher);
let frame_decode = (fs.frame_decode)(self.frame_decode);
InferenceDomainMap {
talk,
experimental_talk,
singing_teacher,
frame_decode,
}
}
}

impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> {
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> {
impl<T, X, S, F, E> InferenceDomainMap<(Result<T, E>, Result<X, E>, Result<S, E>, Result<F, E>)> {
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, X, S, F)>, E> {
let talk = self.talk?;
let experimental_talk = self.experimental_talk?;
let singing_teacher = self.singing_teacher?;
let frame_decode = self.frame_decode?;
Ok(InferenceDomainMap {
talk,
experimental_talk,
singing_teacher,
frame_decode,
})
Expand All @@ -74,6 +91,7 @@ impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)>
impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V>
where
V::Talk: Deserialize<'de>,
V::ExperimentalTalk: Deserialize<'de>,
V::SingingTeacher: Deserialize<'de>,
V::FrameDecode: Deserialize<'de>,
{
Expand All @@ -83,18 +101,21 @@ where
{
let Repr {
talk,
experimental_talk,
singing_teacher,
frame_decode,
} = Repr::deserialize(deserializer)?;
return Ok(Self {
talk,
experimental_talk,
singing_teacher,
frame_decode,
});

#[derive(Deserialize)]
struct Repr<T, S, F> {
struct Repr<T, E, S, F> {
talk: T,
experimental_talk: E,
singing_teacher: S,
frame_decode: F,
}
Expand All @@ -103,12 +124,14 @@ where

/// Type-level description of what an `InferenceDomainMap` stores: one associated type per
/// inference domain slot.
pub(crate) trait InferenceDomainMapValues {
/// Type stored in the `talk` slot.
type Talk;
/// Type stored in the `experimental_talk` slot.
type ExperimentalTalk;
/// Type stored in the `singing_teacher` slot.
type SingingTeacher;
/// Type stored in the `frame_decode` slot.
type FrameDecode;
}

// Lets a plain tuple choose the per-domain types positionally: each tuple element is bound to
// the associated type assigned to it in the body below.
impl<T, S, F> InferenceDomainMapValues for (T, S, F) {
impl<T, X, S, F> InferenceDomainMapValues for (T, X, S, F) {
type Talk = T;
type ExperimentalTalk = X;
type SingingTeacher = S;
type FrameDecode = F;
}
Expand All @@ -120,6 +143,10 @@ macro_rules! inference_domain_map_values {
$body
where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
),
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::ExperimentalTalkDomain as crate::infer::InferenceDomain
),
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain
Expand Down
116 changes: 116 additions & 0 deletions crates/voicevox_core/src/infer/domains/experimental_talk.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
use std::{collections::BTreeSet, sync::LazyLock};

use enum_map::Enum;
use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
use ndarray::{Array0, Array1, Array2};

use crate::{manifest::ExperimentalTalkManifest, StyleType};

use super::super::{
InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
};

/// Uninhabited marker type naming the experimental talk inference domain. It has no variants,
/// so no value of it can exist; it is used only at the type level through its
/// `InferenceDomain` implementation.
pub(crate) enum ExperimentalTalkDomain {}

/// Wires the experimental talk domain to its operation enum and manifest type.
impl InferenceDomain for ExperimentalTalkDomain {
    type Operation = ExperimentalTalkOperation;
    type Manifest = ExperimentalTalkManifest;

    /// Returns the style types this domain applies to: a set containing only
    /// [`StyleType::Talk`].
    ///
    /// The set is built on first access and then reused for every later call.
    fn style_types() -> &'static BTreeSet<StyleType> {
        static SUPPORTED: LazyLock<BTreeSet<StyleType>> =
            LazyLock::new(|| BTreeSet::from([StyleType::Talk]));
        LazyLock::force(&SUPPORTED)
    }
}

/// The operations that make up the experimental talk domain.
///
/// Each variant is tied to its input and output signature types through the
/// `#[inference_operation(...)]` attribute placed on it; the enum-level attribute names the
/// owning domain (`ExperimentalTalkDomain`).
#[derive(Clone, Copy, Enum, InferenceOperation)]
#[inference_operation(
type Domain = ExperimentalTalkDomain;
)]
pub(crate) enum ExperimentalTalkOperation {
// Takes `PredictDurationInput`, yields `PredictDurationOutput`.
#[inference_operation(
type Input = PredictDurationInput;
type Output = PredictDurationOutput;
)]
PredictDuration,

// Takes `PredictIntonationInput`, yields `PredictIntonationOutput`.
#[inference_operation(
type Input = PredictIntonationInput;
type Output = PredictIntonationOutput;
)]
PredictIntonation,

// Takes `GenerateFullIntermediateInput`, yields `GenerateFullIntermediateOutput`.
#[inference_operation(
type Input = GenerateFullIntermediateInput;
type Output = GenerateFullIntermediateOutput;
)]
GenerateFullIntermediate,

// Takes `RenderAudioSegmentInput`, yields `RenderAudioSegmentOutput`.
#[inference_operation(
type Input = RenderAudioSegmentInput;
type Output = RenderAudioSegmentOutput;
)]
RenderAudioSegment,
}

/// Input tensors for the `PredictDuration` operation: two one-dimensional `i64` arrays.
// NOTE(review): the derive presumably relies on field names and order — confirm before
// renaming or reordering fields.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictDuration;
)]
pub(crate) struct PredictDurationInput {
pub(crate) phoneme_list: Array1<i64>,
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `PredictDuration` operation: a single one-dimensional `f32` array.
// Presumably one length value per input phoneme — TODO confirm against the caller.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictDurationOutput {
pub(crate) phoneme_length: Array1<f32>,
}

/// Input tensors for the `PredictIntonation` operation: one zero-dimensional (scalar) `i64`
/// followed by seven one-dimensional `i64` arrays.
// NOTE(review): `length` presumably gives the element count shared by the `*_list` arrays —
// confirm against the caller.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictIntonation;
)]
pub(crate) struct PredictIntonationInput {
pub(crate) length: Array0<i64>,
pub(crate) vowel_phoneme_list: Array1<i64>,
pub(crate) consonant_phoneme_list: Array1<i64>,
pub(crate) start_accent_list: Array1<i64>,
pub(crate) end_accent_list: Array1<i64>,
pub(crate) start_accent_phrase_list: Array1<i64>,
pub(crate) end_accent_phrase_list: Array1<i64>,
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `PredictIntonation` operation: a single one-dimensional `f32` array.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictIntonationOutput {
pub(crate) f0_list: Array1<f32>,
}

/// Input tensors for the `GenerateFullIntermediate` operation: two two-dimensional `f32`
/// arrays and a one-dimensional `i64` array.
// NOTE(review): the axis meaning of `f0` and `phoneme` is not visible here — confirm against
// the caller.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = GenerateFullIntermediate;
)]
pub(crate) struct GenerateFullIntermediateInput {
pub(crate) f0: Array2<f32>,
pub(crate) phoneme: Array2<f32>,
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `GenerateFullIntermediate` operation: a single two-dimensional `f32` array.
// Has the same field name and type as `RenderAudioSegmentInput::spec`; presumably it is what
// that operation consumes — verify against the caller.
#[derive(InferenceOutputSignature)]
pub(crate) struct GenerateFullIntermediateOutput {
pub(crate) spec: Array2<f32>,
}

/// Input tensor for the `RenderAudioSegment` operation: a single two-dimensional `f32` array.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = RenderAudioSegment;
)]
pub(crate) struct RenderAudioSegmentInput {
pub(crate) spec: Array2<f32>,
}

/// Output of the `RenderAudioSegment` operation: a single one-dimensional `f32` array.
// Presumably audio samples — the sample rate and value range are not visible here; confirm
// against the caller.
#[derive(InferenceOutputSignature)]
pub(crate) struct RenderAudioSegmentOutput {
pub(crate) wave: Array1<f32>,
}
Loading

0 comments on commit 24449be

Please sign in to comment.