Skip to content

Commit

Permalink
change: `style_id_to_model_inner_id` → `style_id_to_inner_voice_id` (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
qryxip authored May 23, 2024
1 parent 5a644ca commit 9c3a94e
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 52 deletions.
16 changes: 8 additions & 8 deletions crates/voicevox_core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@ impl Display for ManifestVersion {
}

/// モデル内IDの実体
pub type RawModelInnerId = u32;
pub type RawInnerVoiceId = u32;
/// モデル内ID
#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
pub struct ModelInnerId(RawModelInnerId);
pub struct InnerVoiceId(RawInnerVoiceId);

impl ModelInnerId {
pub fn raw_id(self) -> RawModelInnerId {
impl InnerVoiceId {
pub fn raw_id(self) -> RawInnerVoiceId {
self.0
}
}

impl Display for ModelInnerId {
impl Display for InnerVoiceId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.raw_id())
}
Expand Down Expand Up @@ -58,12 +58,12 @@ pub(crate) struct TalkManifest {
pub(crate) predict_intonation_filename: String,
pub(crate) decode_filename: String,
#[serde(default)]
pub(crate) style_id_to_model_inner_id: StyleIdToModelInnerId,
pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId,
}

#[serde_as]
#[derive(Default, Clone, Deref, Deserialize)]
#[deref(forward)]
pub(crate) struct StyleIdToModelInnerId(
#[serde_as(as = "Arc<BTreeMap<DisplayFromStr, _>>")] Arc<BTreeMap<StyleId, ModelInnerId>>,
pub(crate) struct StyleIdToInnerVoiceId(
#[serde_as(as = "Arc<BTreeMap<DisplayFromStr, _>>")] Arc<BTreeMap<StyleId, InnerVoiceId>>,
);
50 changes: 25 additions & 25 deletions crates/voicevox_core/src/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ use crate::{
InferenceDomain, InferenceInputSignature, InferenceRuntime, InferenceSessionOptions,
InferenceSignature,
},
manifest::{ModelInnerId, StyleIdToModelInnerId},
manifest::{InnerVoiceId, StyleIdToInnerVoiceId},
metas::{self, SpeakerMeta, StyleId, StyleMeta, VoiceModelMeta},
voice_model::{ModelBytesWithInnerIdsByDomain, VoiceModelHeader, VoiceModelId},
voice_model::{ModelBytesWithInnerVoiceIdsByDomain, VoiceModelHeader, VoiceModelId},
Result,
};

Expand All @@ -36,7 +36,7 @@ impl<R: InferenceRuntime> Status<R> {
pub(crate) fn insert_model(
&self,
model_header: &VoiceModelHeader,
model_contents: &InferenceDomainMap<ModelBytesWithInnerIdsByDomain>,
model_contents: &InferenceDomainMap<ModelBytesWithInnerVoiceIdsByDomain>,
) -> Result<()> {
self.loaded_models
.lock()
Expand Down Expand Up @@ -66,14 +66,14 @@ impl<R: InferenceRuntime> Status<R> {
self.loaded_models.lock().unwrap().metas()
}

/// あるスタイルに対応する`VoiceModelId`と`ModelInnerId`の組を返す。
/// あるスタイルに対応する`VoiceModelId`と`InnerVoiceId`の組を返す。
///
/// `StyleId` → `ModelInnerId`のマッピングが存在しない場合は、`ModelInnerId`としては
/// `StyleId` → `InnerVoiceId`のマッピングが存在しない場合は、`InnerVoiceId`としては
/// `style_id`と同じ値を返す。
pub(crate) fn ids_for<D: InferenceDomainExt>(
&self,
style_id: StyleId,
) -> Result<(VoiceModelId, ModelInnerId)> {
) -> Result<(VoiceModelId, InnerVoiceId)> {
self.loaded_models.lock().unwrap().ids_for::<D>(style_id)
}

Expand Down Expand Up @@ -122,7 +122,7 @@ struct LoadedModels<R: InferenceRuntime>(IndexMap<VoiceModelId, LoadedModel<R>>)

struct LoadedModel<R: InferenceRuntime> {
metas: VoiceModelMeta,
session_sets_with_inner_ids: InferenceDomainMap<SessionSetsWithInnerIdsByDomain<R>>,
session_sets_with_inner_ids: InferenceDomainMap<SessionSetsWithInnerVoiceIdsByDomain<R>>,
}

impl<R: InferenceRuntime> LoadedModels<R> {
Expand All @@ -133,7 +133,7 @@ impl<R: InferenceRuntime> LoadedModels<R> {
fn ids_for<D: InferenceDomainExt>(
&self,
style_id: StyleId,
) -> Result<(VoiceModelId, ModelInnerId)> {
) -> Result<(VoiceModelId, InnerVoiceId)> {
let (
model_id,
LoadedModel {
Expand All @@ -153,13 +153,13 @@ impl<R: InferenceRuntime> LoadedModels<R> {
style_types: D::style_types(),
})?;

let model_inner_id = session_sets_with_inner_ids
let inner_voice_id = session_sets_with_inner_ids
.get::<D>()
.as_ref()
.and_then(|(model_inner_ids, _)| model_inner_ids.get(&style_id).copied())
.unwrap_or_else(|| ModelInnerId::new(style_id.raw_id()));
.and_then(|(inner_voice_ids, _)| inner_voice_ids.get(&style_id).copied())
.unwrap_or_else(|| InnerVoiceId::new(style_id.raw_id()));

Ok((model_id.clone(), model_inner_id))
Ok((model_id.clone(), inner_voice_id))
}

/// # Panics
Expand Down Expand Up @@ -250,7 +250,7 @@ impl<R: InferenceRuntime> LoadedModels<R> {
fn insert(
&mut self,
model_header: &VoiceModelHeader,
session_sets_with_inner_ids: InferenceDomainMap<SessionSetsWithInnerIdsByDomain<R>>,
session_sets_with_inner_ids: InferenceDomainMap<SessionSetsWithInnerVoiceIdsByDomain<R>>,
) -> Result<()> {
self.ensure_acceptable(model_header)?;

Expand Down Expand Up @@ -286,8 +286,8 @@ impl<R: InferenceRuntime> LoadedModels<R> {

pub(crate) trait InferenceDomainExt: InferenceDomain {
fn visit<R: InferenceRuntime>(
map: &InferenceDomainMap<SessionSetsWithInnerIdsByDomain<R>>,
) -> Option<&(StyleIdToModelInnerId, InferenceSessionSet<R, Self>)>;
map: &InferenceDomainMap<SessionSetsWithInnerVoiceIdsByDomain<R>>,
) -> Option<&(StyleIdToInnerVoiceId, InferenceSessionSet<R, Self>)>;
}

#[duplicate_item(
Expand All @@ -296,25 +296,25 @@ pub(crate) trait InferenceDomainExt: InferenceDomain {
)]
impl InferenceDomainExt for T {
fn visit<R: InferenceRuntime>(
map: &InferenceDomainMap<SessionSetsWithInnerIdsByDomain<R>>,
) -> Option<&(StyleIdToModelInnerId, InferenceSessionSet<R, Self>)> {
map: &InferenceDomainMap<SessionSetsWithInnerVoiceIdsByDomain<R>>,
) -> Option<&(StyleIdToInnerVoiceId, InferenceSessionSet<R, Self>)> {
map.field.as_ref()
}
}

impl<R: InferenceRuntime> InferenceDomainMap<SessionSetsWithInnerIdsByDomain<R>> {
impl<R: InferenceRuntime> InferenceDomainMap<SessionSetsWithInnerVoiceIdsByDomain<R>> {
fn get<D: InferenceDomainExt>(
&self,
) -> Option<&(StyleIdToModelInnerId, InferenceSessionSet<R, D>)> {
) -> Option<&(StyleIdToInnerVoiceId, InferenceSessionSet<R, D>)> {
D::visit(self)
}
}

impl InferenceDomainMap<ModelBytesWithInnerIdsByDomain> {
impl InferenceDomainMap<ModelBytesWithInnerVoiceIdsByDomain> {
fn create_session_sets<R: InferenceRuntime>(
&self,
session_options: &InferenceDomainMap<SessionOptionsByDomain>,
) -> anyhow::Result<InferenceDomainMap<SessionSetsWithInnerIdsByDomain<R>>> {
) -> anyhow::Result<InferenceDomainMap<SessionSetsWithInnerVoiceIdsByDomain<R>>> {
duplicate! {
[
field;
Expand All @@ -323,9 +323,9 @@ impl InferenceDomainMap<ModelBytesWithInnerIdsByDomain> {
let field = self
.field
.as_ref()
.map(|(model_inner_ids, model_bytes)| {
.map(|(inner_voice_ids, model_bytes)| {
let session_set = InferenceSessionSet::new(model_bytes, &session_options.field)?;
Ok::<_, anyhow::Error>((model_inner_ids.clone(), session_set))
Ok::<_, anyhow::Error>((inner_voice_ids.clone(), session_set))
})
.transpose()?;
}
Expand All @@ -336,8 +336,8 @@ impl InferenceDomainMap<ModelBytesWithInnerIdsByDomain> {

type SessionOptionsByDomain = (EnumMap<TalkOperation, InferenceSessionOptions>,);

type SessionSetsWithInnerIdsByDomain<R> =
(Option<(StyleIdToModelInnerId, InferenceSessionSet<R, TalkDomain>)>,);
type SessionSetsWithInnerVoiceIdsByDomain<R> =
(Option<(StyleIdToInnerVoiceId, InferenceSessionSet<R, TalkDomain>)>,);

#[cfg(test)]
mod tests {
Expand Down
12 changes: 6 additions & 6 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -836,15 +836,15 @@ pub(crate) mod blocking {

impl<O> PerformInference for self::Synthesizer<O> {
fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result<Vec<f32>> {
let (model_id, model_inner_id) = self.status.ids_for::<TalkDomain>(style_id)?;
let (model_id, inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;

let PredictDurationOutput {
phoneme_length: output,
} = self.status.run_session(
&model_id,
PredictDurationInput {
phoneme_list: ndarray::arr1(phoneme_vector),
speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
},
)?;
let mut output = output.into_raw_vec();
Expand All @@ -871,7 +871,7 @@ pub(crate) mod blocking {
end_accent_phrase_vector: &[i64],
style_id: StyleId,
) -> Result<Vec<f32>> {
let (model_id, model_inner_id) = self.status.ids_for::<TalkDomain>(style_id)?;
let (model_id, inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;

let PredictIntonationOutput { f0_list: output } = self.status.run_session(
&model_id,
Expand All @@ -883,7 +883,7 @@ pub(crate) mod blocking {
end_accent_list: ndarray::arr1(end_accent_vector),
start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector),
end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector),
speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
},
)?;

Expand All @@ -898,7 +898,7 @@ pub(crate) mod blocking {
phoneme_vector: &[f32],
style_id: StyleId,
) -> Result<Vec<f32>> {
let (model_id, model_inner_id) = self.status.ids_for::<TalkDomain>(style_id)?;
let (model_id, inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
Expand All @@ -925,7 +925,7 @@ pub(crate) mod blocking {
phoneme: ndarray::arr1(&phoneme_with_padding)
.into_shape([length_with_padding, phoneme_size])
.unwrap(),
speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
},
)?;

Expand Down
24 changes: 12 additions & 12 deletions crates/voicevox_core/src/voice_model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use crate::{
domains::{TalkDomain, TalkOperation},
InferenceDomain,
},
manifest::{Manifest, ManifestDomains, StyleIdToModelInnerId},
manifest::{Manifest, ManifestDomains, StyleIdToInnerVoiceId},
SpeakerMeta, StyleMeta, StyleType, VoiceModelMeta,
};
use std::path::{Path, PathBuf};
Expand All @@ -26,8 +26,8 @@ use std::path::{Path, PathBuf};
/// [`VoiceModelId`]: VoiceModelId
pub type RawVoiceModelId = String;

pub(crate) type ModelBytesWithInnerIdsByDomain =
(Option<(StyleIdToModelInnerId, EnumMap<TalkOperation, Vec<u8>>)>,);
pub(crate) type ModelBytesWithInnerVoiceIdsByDomain =
(Option<(StyleIdToInnerVoiceId, EnumMap<TalkOperation, Vec<u8>>)>,);

/// 音声モデルID。
#[derive(
Expand Down Expand Up @@ -164,7 +164,7 @@ pub(crate) mod blocking {
VoiceModelMeta,
};

use super::{ModelBytesWithInnerIdsByDomain, VoiceModelHeader, VoiceModelId};
use super::{ModelBytesWithInnerVoiceIdsByDomain, VoiceModelHeader, VoiceModelId};

/// 音声モデル。
///
Expand All @@ -177,7 +177,7 @@ pub(crate) mod blocking {
impl self::VoiceModel {
pub(crate) fn read_inference_models(
&self,
) -> LoadModelResult<InferenceDomainMap<ModelBytesWithInnerIdsByDomain>> {
) -> LoadModelResult<InferenceDomainMap<ModelBytesWithInnerVoiceIdsByDomain>> {
let reader = BlockingVvmEntryReader::open(&self.header.path)?;

let talk = self
Expand All @@ -191,7 +191,7 @@ pub(crate) mod blocking {
predict_duration_filename,
predict_intonation_filename,
decode_filename,
style_id_to_model_inner_id,
style_id_to_inner_voice_id,
}| {
let model_bytes = [
predict_duration_filename,
Expand All @@ -206,7 +206,7 @@ pub(crate) mod blocking {

let model_bytes = EnumMap::from_array(model_bytes);

Ok((style_id_to_model_inner_id.clone(), model_bytes))
Ok((style_id_to_inner_voice_id.clone(), model_bytes))
},
)
.transpose()?;
Expand Down Expand Up @@ -307,7 +307,7 @@ pub(crate) mod tokio {
Result, VoiceModelMeta,
};

use super::{ModelBytesWithInnerIdsByDomain, VoiceModelHeader, VoiceModelId};
use super::{ModelBytesWithInnerVoiceIdsByDomain, VoiceModelHeader, VoiceModelId};

/// 音声モデル。
///
Expand All @@ -320,15 +320,15 @@ pub(crate) mod tokio {
impl self::VoiceModel {
pub(crate) async fn read_inference_models(
&self,
) -> LoadModelResult<InferenceDomainMap<ModelBytesWithInnerIdsByDomain>> {
) -> LoadModelResult<InferenceDomainMap<ModelBytesWithInnerVoiceIdsByDomain>> {
let reader = AsyncVvmEntryReader::open(&self.header.path).await?;

let talk = OptionFuture::from(self.header.manifest.domains().talk.as_ref().map(
|TalkManifest {
predict_duration_filename,
predict_intonation_filename,
decode_filename,
style_id_to_model_inner_id,
style_id_to_inner_voice_id,
}| async {
let (
decode_model_result,
Expand All @@ -347,7 +347,7 @@ pub(crate) mod tokio {
decode_model_result?,
]);

Ok((style_id_to_model_inner_id.clone(), model_bytes))
Ok((style_id_to_inner_voice_id.clone(), model_bytes))
},
))
.await
Expand Down Expand Up @@ -505,7 +505,7 @@ mod tests {
predict_duration_filename: "".to_owned(),
predict_intonation_filename: "".to_owned(),
decode_filename: "".to_owned(),
style_id_to_model_inner_id: Default::default(),
style_id_to_inner_voice_id: Default::default(),
});

#[fixture]
Expand Down
2 changes: 1 addition & 1 deletion model/sample.vvm/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"predict_duration_filename": "predict_duration.onnx",
"predict_intonation_filename": "predict_intonation.onnx",
"decode_filename": "decode.onnx",
"style_id_to_model_inner_id": {
"style_id_to_inner_voice_id": {
"302": 2,
"303": 3
}
Expand Down

0 comments on commit 9c3a94e

Please sign in to comment.