diff --git a/crates/voicevox_core/src/__internal/interop.rs b/crates/voicevox_core/src/__internal/interop.rs index d76df9d88..c8cd7101f 100644 --- a/crates/voicevox_core/src/__internal/interop.rs +++ b/crates/voicevox_core/src/__internal/interop.rs @@ -1 +1 @@ -pub use crate::synthesizer::PerformInference; +pub use crate::synthesizer::blocking::PerformInference; diff --git a/crates/voicevox_core/src/engine/open_jtalk.rs b/crates/voicevox_core/src/engine/open_jtalk.rs index e81db51ee..ad1f8c19e 100644 --- a/crates/voicevox_core/src/engine/open_jtalk.rs +++ b/crates/voicevox_core/src/engine/open_jtalk.rs @@ -1,15 +1,4 @@ -use std::io::Write; -use std::sync::Arc; -use std::{path::Path, sync::Mutex}; - -use anyhow::anyhow; -use tempfile::NamedTempFile; - -use ::open_jtalk::{ - mecab_dict_index, text2mecab, JpCommon, ManagedResource, Mecab, Njd, Text2MecabError, -}; - -use crate::error::ErrorRepr; +use ::open_jtalk::Text2MecabError; #[derive(thiserror::Error, Debug)] #[error("`{function}`の実行が失敗しました")] @@ -19,199 +8,218 @@ pub(crate) struct OpenjtalkFunctionError { source: Option, } -struct Resources { - mecab: ManagedResource, - njd: ManagedResource, - jpcommon: ManagedResource, +pub trait FullcontextExtractor: Clone + Send + Sync + 'static { + fn extract_fullcontext(&self, text: &str) -> anyhow::Result>; } -#[allow(unsafe_code)] -unsafe impl Send for Resources {} - -impl self::blocking::OpenJtalk { - pub fn new(open_jtalk_dict_dir: impl AsRef) -> crate::result::Result { - let dict_dir = open_jtalk_dict_dir - .as_ref() - .to_str() - .unwrap_or_else(|| todo!()) // FIXME: `camino::Utf8Path`を要求するようにする - .to_owned(); - - // FIXME: この`{}`はGitのdiffを抑えるためだけに存在 - { - let mut resources = Resources { - mecab: ManagedResource::initialize(), - njd: ManagedResource::initialize(), - jpcommon: ManagedResource::initialize(), - }; +pub(crate) mod blocking { + use std::{ + io::Write as _, + path::Path, + sync::{Arc, Mutex}, + }; - let result = resources.mecab.load(&*dict_dir); - if !result { - // FIXME: 「システム辞書を読もうとしたけど読めなかった」というエラーをちゃんと用意する - return Err(ErrorRepr::NotLoadedOpenjtalkDict.into()); - } + use anyhow::anyhow; + use open_jtalk::{mecab_dict_index, text2mecab, JpCommon, ManagedResource, Mecab, Njd}; + use tempfile::NamedTempFile; - Ok(Self(Arc::new(self::blocking::Inner { - resources: Mutex::new(resources), - dict_dir, - }))) - } - } + use crate::error::ErrorRepr; - /// ユーザー辞書を設定する。 - /// - /// この関数を呼び出した後にユーザー辞書を変更した場合は、再度この関数を呼ぶ必要がある。 - pub fn use_user_dict( - &self, - user_dict: &crate::blocking::UserDict, - ) -> crate::result::Result<()> { - let words = &user_dict.to_mecab_format(); - self.0.use_user_dict(words) - } -} + use super::{FullcontextExtractor, OpenjtalkFunctionError}; -impl self::tokio::OpenJtalk { - pub async fn new(open_jtalk_dict_dir: impl AsRef) -> crate::result::Result { - let open_jtalk_dict_dir = open_jtalk_dict_dir.as_ref().to_owned(); - let blocking = - crate::task::asyncify(|| self::blocking::OpenJtalk::new(open_jtalk_dict_dir)).await?; - Ok(Self(blocking)) - } + /// テキスト解析器としてのOpen JTalk。 + #[derive(Clone)] + pub struct OpenJtalk(pub(super) Arc); + + impl self::OpenJtalk { + pub fn new(open_jtalk_dict_dir: impl AsRef) -> crate::result::Result { + let dict_dir = open_jtalk_dict_dir + .as_ref() + .to_str() + .unwrap_or_else(|| todo!()) // FIXME: `camino::Utf8Path`を要求するようにする + .to_owned(); + + // FIXME: この`{}`はGitのdiffを抑えるためだけに存在 + { + let mut resources = Resources { + mecab: ManagedResource::initialize(), + njd: ManagedResource::initialize(), + jpcommon: 
ManagedResource::initialize(), + }; + + let result = resources.mecab.load(&*dict_dir); + if !result { + // FIXME: 「システム辞書を読もうとしたけど読めなかった」というエラーをちゃんと用意する + return Err(ErrorRepr::NotLoadedOpenjtalkDict.into()); + } + + Ok(Self(Arc::new(Inner { + resources: Mutex::new(resources), + dict_dir, + }))) + } + } - /// ユーザー辞書を設定する。 - /// - /// この関数を呼び出した後にユーザー辞書を変更した場合は、再度この関数を呼ぶ必要がある。 - pub async fn use_user_dict( - &self, - user_dict: &crate::tokio::UserDict, - ) -> crate::result::Result<()> { - let inner = self.0 .0.clone(); - let words = user_dict.to_mecab_format(); - crate::task::asyncify(move || inner.use_user_dict(&words)).await + /// ユーザー辞書を設定する。 + /// + /// この関数を呼び出した後にユーザー辞書を変更した場合は、再度この関数を呼ぶ必要がある。 + pub fn use_user_dict( + &self, + user_dict: &crate::blocking::UserDict, + ) -> crate::result::Result<()> { + let words = &user_dict.to_mecab_format(); + self.0.use_user_dict(words) + } } -} -impl self::blocking::Inner { - // FIXME: 中断可能にする - fn use_user_dict(&self, words: &str) -> crate::result::Result<()> { - let result = { - // ユーザー辞書用のcsvを作成 - let mut temp_csv = - NamedTempFile::new().map_err(|e| ErrorRepr::UseUserDict(e.into()))?; - temp_csv - .write_all(words.as_ref()) - .map_err(|e| ErrorRepr::UseUserDict(e.into()))?; - let temp_csv_path = temp_csv.into_temp_path(); - let temp_dict = NamedTempFile::new().map_err(|e| ErrorRepr::UseUserDict(e.into()))?; - let temp_dict_path = temp_dict.into_temp_path(); - - // Mecabでユーザー辞書をコンパイル - // TODO: エラー(SEGV)が出るパターンを把握し、それをRust側で防ぐ。 - mecab_dict_index(&[ - "mecab-dict-index", - "-d", - &self.dict_dir, - "-u", - temp_dict_path.to_str().unwrap(), - "-f", - "utf-8", - "-t", - "utf-8", - temp_csv_path.to_str().unwrap(), - "-q", - ]); - - let Resources { mecab, .. } = &mut *self.resources.lock().unwrap(); - - mecab.load_with_userdic(self.dict_dir.as_ref(), Some(Path::new(&temp_dict_path))) - }; - - if !result { - return Err(ErrorRepr::UseUserDict(anyhow!("辞書のコンパイルに失敗しました")).into()); + impl FullcontextExtractor for self::OpenJtalk { + fn extract_fullcontext(&self, text: &str) -> anyhow::Result> { + let Resources { + mecab, + njd, + jpcommon, + } = &mut *self.0.resources.lock().unwrap(); + + jpcommon.refresh(); + njd.refresh(); + mecab.refresh(); + + let mecab_text = text2mecab(text).map_err(|e| OpenjtalkFunctionError { + function: "text2mecab", + source: Some(e), + })?; + if mecab.analysis(mecab_text) { + njd.mecab2njd( + mecab.get_feature().ok_or(OpenjtalkFunctionError { + function: "Mecab_get_feature", + source: None, + })?, + mecab.get_size(), + ); + njd.set_pronunciation(); + njd.set_digit(); + njd.set_accent_phrase(); + njd.set_accent_type(); + njd.set_unvoiced_vowel(); + njd.set_long_vowel(); + jpcommon.njd2jpcommon(njd); + jpcommon.make_label(); + jpcommon + .get_label_feature_to_iter() + .ok_or(OpenjtalkFunctionError { + function: "JPCommon_get_label_feature", + source: None, + }) + .map(|iter| iter.map(|s| s.to_string()).collect()) + .map_err(Into::into) + } else { + Err(OpenjtalkFunctionError { + function: "Mecab_analysis", + source: None, + } + .into()) + } } + } - Ok(()) + pub(super) struct Inner { + resources: std::sync::Mutex, + dict_dir: String, // FIXME: `camino::Utf8PathBuf`にする } -} -pub trait FullcontextExtractor: Clone + Send + Sync + 'static { - fn extract_fullcontext(&self, text: &str) -> anyhow::Result>; -} + impl Inner { + // FIXME: 中断可能にする + pub(super) fn use_user_dict(&self, words: &str) -> crate::result::Result<()> { + let result = { + // ユーザー辞書用のcsvを作成 + let mut temp_csv = + NamedTempFile::new().map_err(|e| 
ErrorRepr::UseUserDict(e.into()))?; + temp_csv + .write_all(words.as_ref()) + .map_err(|e| ErrorRepr::UseUserDict(e.into()))?; + let temp_csv_path = temp_csv.into_temp_path(); + let temp_dict = + NamedTempFile::new().map_err(|e| ErrorRepr::UseUserDict(e.into()))?; + let temp_dict_path = temp_dict.into_temp_path(); + + // Mecabでユーザー辞書をコンパイル + // TODO: エラー(SEGV)が出るパターンを把握し、それをRust側で防ぐ。 + mecab_dict_index(&[ + "mecab-dict-index", + "-d", + &self.dict_dir, + "-u", + temp_dict_path.to_str().unwrap(), + "-f", + "utf-8", + "-t", + "utf-8", + temp_csv_path.to_str().unwrap(), + "-q", + ]); + + let Resources { mecab, .. } = &mut *self.resources.lock().unwrap(); + + mecab.load_with_userdic(self.dict_dir.as_ref(), Some(Path::new(&temp_dict_path))) + }; -impl FullcontextExtractor for self::blocking::OpenJtalk { - fn extract_fullcontext(&self, text: &str) -> anyhow::Result> { - let Resources { - mecab, - njd, - jpcommon, - } = &mut *self.0.resources.lock().unwrap(); - - jpcommon.refresh(); - njd.refresh(); - mecab.refresh(); - - let mecab_text = text2mecab(text).map_err(|e| OpenjtalkFunctionError { - function: "text2mecab", - source: Some(e), - })?; - if mecab.analysis(mecab_text) { - njd.mecab2njd( - mecab.get_feature().ok_or(OpenjtalkFunctionError { - function: "Mecab_get_feature", - source: None, - })?, - mecab.get_size(), - ); - njd.set_pronunciation(); - njd.set_digit(); - njd.set_accent_phrase(); - njd.set_accent_type(); - njd.set_unvoiced_vowel(); - njd.set_long_vowel(); - jpcommon.njd2jpcommon(njd); - jpcommon.make_label(); - jpcommon - .get_label_feature_to_iter() - .ok_or(OpenjtalkFunctionError { - function: "JPCommon_get_label_feature", - source: None, - }) - .map(|iter| iter.map(|s| s.to_string()).collect()) - .map_err(Into::into) - } else { - Err(OpenjtalkFunctionError { - function: "Mecab_analysis", - source: None, + if !result { + return Err( + ErrorRepr::UseUserDict(anyhow!("辞書のコンパイルに失敗しました")).into(), + ); } - .into()) + + Ok(()) } } -} -impl FullcontextExtractor for self::tokio::OpenJtalk { - fn extract_fullcontext(&self, text: &str) -> anyhow::Result> { - self.0.extract_fullcontext(text) + struct Resources { + mecab: ManagedResource, + njd: ManagedResource, + jpcommon: ManagedResource, } + + // FIXME: open_jtalk-rs側で宣言する + #[allow(unsafe_code)] + unsafe impl Send for Resources {} } -pub(crate) mod blocking { - use std::sync::Arc; +pub(crate) mod tokio { + use std::path::Path; - use super::Resources; + use super::FullcontextExtractor; /// テキスト解析器としてのOpen JTalk。 #[derive(Clone)] - pub struct OpenJtalk(pub(super) Arc); + pub struct OpenJtalk(super::blocking::OpenJtalk); + + impl self::OpenJtalk { + pub async fn new(open_jtalk_dict_dir: impl AsRef) -> crate::result::Result { + let open_jtalk_dict_dir = open_jtalk_dict_dir.as_ref().to_owned(); + let blocking = + crate::task::asyncify(|| super::blocking::OpenJtalk::new(open_jtalk_dict_dir)) + .await?; + Ok(Self(blocking)) + } - pub(super) struct Inner { - pub(super) resources: std::sync::Mutex, - pub(super) dict_dir: String, // FIXME: `camino::Utf8PathBuf`にする + /// ユーザー辞書を設定する。 + /// + /// この関数を呼び出した後にユーザー辞書を変更した場合は、再度この関数を呼ぶ必要がある。 + pub async fn use_user_dict( + &self, + user_dict: &crate::tokio::UserDict, + ) -> crate::result::Result<()> { + let inner = self.0 .0.clone(); + let words = user_dict.to_mecab_format(); + crate::task::asyncify(move || inner.use_user_dict(&words)).await + } } -} -pub(crate) mod tokio { - /// テキスト解析器としてのOpen JTalk。 - #[derive(Clone)] - pub struct OpenJtalk(pub(super) super::blocking::OpenJtalk); + impl 
FullcontextExtractor for self::OpenJtalk { + fn extract_fullcontext(&self, text: &str) -> anyhow::Result> { + self.0.extract_fullcontext(text) + } + } } #[cfg(test)] diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index b2dd02764..202e917c7 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,25 +1,4 @@ -use std::io::{Cursor, Write as _}; - -use enum_map::enum_map; - -use crate::{ - engine::{ - self, create_kana, parse_kana, AccentPhraseModel, FullcontextExtractor, MoraModel, - OjtPhoneme, Utterance, - }, - error::ErrorRepr, - infer::{ - domain::{ - DecodeInput, DecodeOutput, InferenceOperationImpl, PredictDurationInput, - PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, - }, - runtimes::Onnxruntime, - status::Status, - InferenceSessionOptions, - }, - numerics::F32Ext as _, - AudioQueryModel, Result, StyleId, SupportedDevices, VoiceModelId, VoiceModelMeta, -}; +use crate::infer::runtimes::Onnxruntime; /// [`blocking::Synthesizer::synthesis`]および[`tokio::Synthesizer::synthesis`]のオプション。 /// @@ -89,1290 +68,1322 @@ pub struct InitializeOptions { pub cpu_num_threads: u16, } -const DEFAULT_SAMPLING_RATE: u32 = 24000; - pub(crate) type InferenceRuntimeImpl = Onnxruntime; -// FIXME: docを書く -impl self::tokio::Synthesizer { - pub fn new(open_jtalk: O, options: &InitializeOptions) -> Result { - self::blocking::Synthesizer::new(open_jtalk, options) - .map(Into::into) - .map(Self) - } - - pub fn is_gpu_mode(&self) -> bool { - self.0.is_gpu_mode() - } - - pub async fn load_voice_model(&self, model: &crate::tokio::VoiceModel) -> Result<()> { - let model_bytes = &model.read_inference_models().await?; - self.0.status.insert_model(model.header(), model_bytes) - } - - pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { - self.0.unload_voice_model(voice_model_id) - } - - pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool { - self.0.is_loaded_voice_model(voice_model_id) - } - - #[doc(hidden)] - pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.0.is_loaded_model_by_style_id(style_id) - } - - pub fn metas(&self) -> VoiceModelMeta { - self.0.metas() - } - - pub async fn synthesis( - &self, - audio_query: &AudioQueryModel, - style_id: StyleId, - options: &SynthesisOptions, - ) -> Result> { - let blocking = self.0.clone(); - let audio_query = audio_query.clone(); - let options = options.clone(); - - crate::task::asyncify(move || blocking.synthesis(&audio_query, style_id, &options)).await - } - - pub async fn create_accent_phrases_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result> { - let blocking = self.0.clone(); - let kana = kana.to_owned(); - - crate::task::asyncify(move || blocking.create_accent_phrases_from_kana(&kana, style_id)) - .await - } - - pub async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let blocking = self.0.clone(); - let accent_phrases = accent_phrases.to_owned(); - - crate::task::asyncify(move || blocking.replace_mora_data(&accent_phrases, style_id)).await - } - - pub async fn replace_phoneme_length( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let blocking = self.0.clone(); - let accent_phrases = accent_phrases.to_owned(); +pub(crate) mod blocking { + // FIXME: ここのdocのコードブロックはasync版のものなので、`tokio`モジュールの方に移した上で、 + // (ブロッキング版をpublic APIにするならの話ではあるが)ブロッキング版はブロッキング版でコード例 
+ // を用意する - crate::task::asyncify(move || blocking.replace_phoneme_length(&accent_phrases, style_id)) - .await - } + use std::io::{Cursor, Write as _}; - pub async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let blocking = self.0.clone(); - let accent_phrases = accent_phrases.to_owned(); + use enum_map::enum_map; - crate::task::asyncify(move || blocking.replace_mora_pitch(&accent_phrases, style_id)).await - } + use crate::{ + engine::{self, create_kana, parse_kana, MoraModel, OjtPhoneme, Utterance}, + error::ErrorRepr, + infer::{ + domain::{ + DecodeInput, DecodeOutput, InferenceDomainImpl, InferenceOperationImpl, + PredictDurationInput, PredictDurationOutput, PredictIntonationInput, + PredictIntonationOutput, + }, + status::Status, + InferenceSessionOptions, + }, + numerics::F32Ext as _, + AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, + SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, + }; - pub async fn audio_query_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result { - let blocking = self.0.clone(); - let kana = kana.to_owned(); + use super::{AccelerationMode, InferenceRuntimeImpl, InitializeOptions, TtsOptions}; - crate::task::asyncify(move || blocking.audio_query_from_kana(&kana, style_id)).await - } + const DEFAULT_SAMPLING_RATE: u32 = 24000; - pub async fn tts_from_kana( - &self, - kana: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let blocking = self.0.clone(); - let kana = kana.to_owned(); - let options = options.clone(); - - crate::task::asyncify(move || blocking.tts_from_kana(&kana, style_id, &options)).await - } -} + /// 音声シンセサイザ。 + pub struct Synthesizer { + pub(super) status: Status, + open_jtalk: O, + use_gpu: bool, + } + + impl self::Synthesizer { + /// `Synthesizer`をコンストラクトする。 + /// + /// # Example + /// + #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 + #[cfg_attr(not(windows), doc = "```")] + /// # #[tokio::main] + /// # async fn main() -> anyhow::Result<()> { + /// # use test_util::OPEN_JTALK_DIC_DIR; + /// # + /// # const ACCELERATION_MODE: AccelerationMode = AccelerationMode::Cpu; + /// # + /// use std::sync::Arc; + /// + /// use voicevox_core::{ + /// tokio::{OpenJtalk, Synthesizer}, + /// AccelerationMode, InitializeOptions, + /// }; + /// + /// let mut syntesizer = Synthesizer::new( + /// Arc::new(OpenJtalk::new(OPEN_JTALK_DIC_DIR).await.unwrap()), + /// &InitializeOptions { + /// acceleration_mode: ACCELERATION_MODE, + /// ..Default::default() + /// }, + /// )?; + /// # + /// # Ok(()) + /// # } + /// ``` + pub fn new(open_jtalk: O, options: &InitializeOptions) -> Result { + #[cfg(windows)] + list_windows_video_cards(); + + let use_gpu = match options.acceleration_mode { + AccelerationMode::Auto => { + let supported_devices = SupportedDevices::create()?; + + if cfg!(feature = "directml") { + *supported_devices.dml() + } else { + *supported_devices.cuda() + } + } + AccelerationMode::Cpu => false, + AccelerationMode::Gpu => true, + }; -impl self::tokio::Synthesizer { - pub async fn create_accent_phrases( - &self, - text: &str, - style_id: StyleId, - ) -> Result> { - let blocking = self.0.clone(); - let text = text.to_owned(); + if use_gpu && !can_support_gpu_feature()? 
{ + return Err(ErrorRepr::GpuSupport.into()); + } - crate::task::asyncify(move || blocking.create_accent_phrases(&text, style_id)).await - } + // 軽いモデルはこちらを使う + let light_session_options = + InferenceSessionOptions::new(options.cpu_num_threads, false); - pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result { - let blocking = self.0.clone(); - let text = text.to_owned(); + // 重いモデルはこちらを使う + let heavy_session_options = + InferenceSessionOptions::new(options.cpu_num_threads, use_gpu); - crate::task::asyncify(move || blocking.audio_query(&text, style_id)).await - } + let status = Status::new(enum_map! { + InferenceOperationImpl::PredictDuration + | InferenceOperationImpl::PredictIntonation => light_session_options, + InferenceOperationImpl::Decode => heavy_session_options, + }); - pub async fn tts( - &self, - text: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let blocking = self.0.clone(); - let text = text.to_owned(); - let options = options.clone(); - - crate::task::asyncify(move || blocking.tts(&text, style_id, &options)).await - } -} + return Ok(Self { + status, + open_jtalk, + use_gpu, + }); -// FIXME: コードのdiffを抑えるため`impl blocking::Synthesizer`と -// `impl blocking::Synthesizer`がそれぞれ3つ誕生しているので、一つずつにまとめる - -// FIXME: ここのdocのコードブロックはasync版のものなので、↑の方に移した上で、(ブロッキング版を -// public APIにするならの話ではあるが)ブロッキング版はブロッキング版でコード例を用意する -impl self::blocking::Synthesizer { - /// `Synthesizer`をコンストラクトする。 - /// - /// # Example - /// - #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 - #[cfg_attr(not(windows), doc = "```")] - /// # #[tokio::main] - /// # async fn main() -> anyhow::Result<()> { - /// # use test_util::OPEN_JTALK_DIC_DIR; - /// # - /// # const ACCELERATION_MODE: AccelerationMode = AccelerationMode::Cpu; - /// # - /// use std::sync::Arc; - /// - /// use voicevox_core::{ - /// tokio::{OpenJtalk, Synthesizer}, - /// AccelerationMode, InitializeOptions, - /// }; - /// - /// let mut syntesizer = Synthesizer::new( - /// Arc::new(OpenJtalk::new(OPEN_JTALK_DIC_DIR).await.unwrap()), - /// &InitializeOptions { - /// acceleration_mode: ACCELERATION_MODE, - /// ..Default::default() - /// }, - /// )?; - /// # - /// # Ok(()) - /// # } - /// ``` - pub fn new(open_jtalk: O, options: &InitializeOptions) -> Result { - #[cfg(windows)] - list_windows_video_cards(); - - let use_gpu = match options.acceleration_mode { - AccelerationMode::Auto => { + fn can_support_gpu_feature() -> Result { let supported_devices = SupportedDevices::create()?; if cfg!(feature = "directml") { - *supported_devices.dml() + Ok(*supported_devices.dml()) } else { - *supported_devices.cuda() + Ok(*supported_devices.cuda()) } } - AccelerationMode::Cpu => false, - AccelerationMode::Gpu => true, - }; - - if use_gpu && !can_support_gpu_feature()? { - return Err(ErrorRepr::GpuSupport.into()); } - // 軽いモデルはこちらを使う - let light_session_options = InferenceSessionOptions::new(options.cpu_num_threads, false); - - // 重いモデルはこちらを使う - let heavy_session_options = InferenceSessionOptions::new(options.cpu_num_threads, use_gpu); - - let status = Status::new(enum_map! 
{ - InferenceOperationImpl::PredictDuration - | InferenceOperationImpl::PredictIntonation => light_session_options, - InferenceOperationImpl::Decode => heavy_session_options, - }); - - return Ok(Self { - status, - open_jtalk, - use_gpu, - }); + /// ハードウェアアクセラレーションがGPUモードか判定する。 + pub fn is_gpu_mode(&self) -> bool { + self.use_gpu + } - fn can_support_gpu_feature() -> Result { - let supported_devices = SupportedDevices::create()?; + /// 音声モデルを読み込む。 + pub fn load_voice_model(&self, model: &crate::blocking::VoiceModel) -> Result<()> { + let model_bytes = &model.read_inference_models()?; + self.status.insert_model(model.header(), model_bytes) + } - if cfg!(feature = "directml") { - Ok(*supported_devices.dml()) - } else { - Ok(*supported_devices.cuda()) - } + /// 音声モデルの読み込みを解除する。 + pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { + self.status.unload_model(voice_model_id) } - } - /// ハードウェアアクセラレーションがGPUモードか判定する。 - pub fn is_gpu_mode(&self) -> bool { - self.use_gpu - } + /// 指定したIDの音声モデルが読み込まれているか判定する。 + pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool { + self.status.is_loaded_model(voice_model_id) + } - /// 音声モデルを読み込む。 - pub fn load_voice_model(&self, model: &crate::blocking::VoiceModel) -> Result<()> { - let model_bytes = &model.read_inference_models()?; - self.status.insert_model(model.header(), model_bytes) - } + #[doc(hidden)] + pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { + self.status.is_loaded_model_by_style_id(style_id) + } - /// 音声モデルの読み込みを解除する。 - pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { - self.status.unload_model(voice_model_id) - } + /// 今読み込んでいる音声モデルのメタ情報を返す。 + pub fn metas(&self) -> VoiceModelMeta { + self.status.metas() + } - /// 指定したIDの音声モデルが読み込まれているか判定する。 - pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool { - self.status.is_loaded_model(voice_model_id) - } + /// AudioQueryから音声合成を行う。 + pub fn synthesis( + &self, + audio_query: &AudioQueryModel, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { + let speed_scale = *audio_query.speed_scale(); + let pitch_scale = *audio_query.pitch_scale(); + let intonation_scale = *audio_query.intonation_scale(); + let pre_phoneme_length = *audio_query.pre_phoneme_length(); + let post_phoneme_length = *audio_query.post_phoneme_length(); + + let accent_phrases = if options.enable_interrogative_upspeak { + adjust_interrogative_accent_phrases(audio_query.accent_phrases().as_slice()) + } else { + audio_query.accent_phrases().clone() + }; - #[doc(hidden)] - pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.status.is_loaded_model_by_style_id(style_id) - } + let (flatten_moras, phoneme_data_list) = initial_process(&accent_phrases); - /// 今読み込んでいる音声モデルのメタ情報を返す。 - pub fn metas(&self) -> VoiceModelMeta { - self.status.metas() - } + let mut phoneme_length_list = vec![pre_phoneme_length]; + let mut f0_list = vec![0.]; + let mut voiced_list = vec![false]; + { + let mut sum_of_f0_bigger_than_zero = 0.; + let mut count_of_f0_bigger_than_zero = 0; - /// AudioQueryから音声合成を行う。 - pub fn synthesis( - &self, - audio_query: &AudioQueryModel, - style_id: StyleId, - options: &SynthesisOptions, - ) -> Result> { - let speed_scale = *audio_query.speed_scale(); - let pitch_scale = *audio_query.pitch_scale(); - let intonation_scale = *audio_query.intonation_scale(); - let pre_phoneme_length = *audio_query.pre_phoneme_length(); - let post_phoneme_length = 
*audio_query.post_phoneme_length(); - - let accent_phrases = if options.enable_interrogative_upspeak { - adjust_interrogative_accent_phrases(audio_query.accent_phrases().as_slice()) - } else { - audio_query.accent_phrases().clone() - }; + for mora in flatten_moras { + let consonant_length = *mora.consonant_length(); + let vowel_length = *mora.vowel_length(); + let pitch = *mora.pitch(); - let (flatten_moras, phoneme_data_list) = initial_process(&accent_phrases); + if let Some(consonant_length) = consonant_length { + phoneme_length_list.push(consonant_length); + } + phoneme_length_list.push(vowel_length); - let mut phoneme_length_list = vec![pre_phoneme_length]; - let mut f0_list = vec![0.]; - let mut voiced_list = vec![false]; - { - let mut sum_of_f0_bigger_than_zero = 0.; - let mut count_of_f0_bigger_than_zero = 0; + let f0_single = pitch * 2.0_f32.powf(pitch_scale); + f0_list.push(f0_single); - for mora in flatten_moras { - let consonant_length = *mora.consonant_length(); - let vowel_length = *mora.vowel_length(); - let pitch = *mora.pitch(); + let bigger_than_zero = f0_single > 0.; + voiced_list.push(bigger_than_zero); - if let Some(consonant_length) = consonant_length { - phoneme_length_list.push(consonant_length); + if bigger_than_zero { + sum_of_f0_bigger_than_zero += f0_single; + count_of_f0_bigger_than_zero += 1; + } } - phoneme_length_list.push(vowel_length); - - let f0_single = pitch * 2.0_f32.powf(pitch_scale); - f0_list.push(f0_single); + phoneme_length_list.push(post_phoneme_length); + f0_list.push(0.); + voiced_list.push(false); + let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); + + if !mean_f0.is_nan() { + for i in 0..f0_list.len() { + if voiced_list[i] { + f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; + } + } + } + } - let bigger_than_zero = f0_single > 0.; - voiced_list.push(bigger_than_zero); + let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); - if bigger_than_zero { - sum_of_f0_bigger_than_zero += f0_single; - count_of_f0_bigger_than_zero += 1; + let mut phoneme: Vec> = Vec::new(); + let mut f0: Vec = Vec::new(); + { + const RATE: f32 = 24000. 
/ 256.; + let mut sum_of_phoneme_length = 0; + let mut count_of_f0 = 0; + let mut vowel_indexes_index = 0; + + for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { + // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする + // + // https://github.com/VOICEVOX/voicevox_engine/issues/552 + let phoneme_length = ((*phoneme_length * RATE).round_ties_even_() / speed_scale) + .round_ties_even_() as usize; + let phoneme_id = phoneme_data_list[i].phoneme_id(); + + for _ in 0..phoneme_length { + let mut phonemes_vec = vec![0.; OjtPhoneme::num_phoneme()]; + phonemes_vec[phoneme_id as usize] = 1.; + phoneme.push(phonemes_vec) + } + sum_of_phoneme_length += phoneme_length; + + if i as i64 == vowel_indexes[vowel_indexes_index] { + for _ in 0..sum_of_phoneme_length { + f0.push(f0_list[count_of_f0]); + } + count_of_f0 += 1; + sum_of_phoneme_length = 0; + vowel_indexes_index += 1; + } } } - phoneme_length_list.push(post_phoneme_length); - f0_list.push(0.); - voiced_list.push(false); - let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); - - if !mean_f0.is_nan() { - for i in 0..f0_list.len() { - if voiced_list[i] { - f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; + + // 2次元のvectorを1次元に変換し、アドレスを連続させる + let flatten_phoneme = phoneme.into_iter().flatten().collect::>(); + + let wave = &self.decode( + f0.len(), + OjtPhoneme::num_phoneme(), + &f0, + &flatten_phoneme, + style_id, + )?; + return Ok(to_wav(wave, audio_query)); + + fn adjust_interrogative_accent_phrases( + accent_phrases: &[AccentPhraseModel], + ) -> Vec { + accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + adjust_interrogative_moras(accent_phrase), + *accent_phrase.accent(), + accent_phrase.pause_mora().clone(), + *accent_phrase.is_interrogative(), + ) + }) + .collect() + } + + fn adjust_interrogative_moras(accent_phrase: &AccentPhraseModel) -> Vec { + let moras = accent_phrase.moras(); + if *accent_phrase.is_interrogative() && !moras.is_empty() { + let last_mora = moras.last().unwrap(); + let last_mora_pitch = *last_mora.pitch(); + if last_mora_pitch != 0.0 { + let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); + new_moras.extend_from_slice(moras.as_slice()); + let interrogative_mora = make_interrogative_mora(last_mora); + new_moras.push(interrogative_mora); + return new_moras; } } + moras.clone() } - } - let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); + fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel { + const FIX_VOWEL_LENGTH: f32 = 0.15; + const ADJUST_PITCH: f32 = 0.3; + const MAX_PITCH: f32 = 6.5; - let mut phoneme: Vec> = Vec::new(); - let mut f0: Vec = Vec::new(); - { - const RATE: f32 = 24000. 
/ 256.; - let mut sum_of_phoneme_length = 0; - let mut count_of_f0 = 0; - let mut vowel_indexes_index = 0; - - for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { - // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする - // - // https://github.com/VOICEVOX/voicevox_engine/issues/552 - let phoneme_length = ((*phoneme_length * RATE).round_ties_even_() / speed_scale) - .round_ties_even_() as usize; - let phoneme_id = phoneme_data_list[i].phoneme_id(); - - for _ in 0..phoneme_length { - let mut phonemes_vec = vec![0.; OjtPhoneme::num_phoneme()]; - phonemes_vec[phoneme_id as usize] = 1.; - phoneme.push(phonemes_vec) - } - sum_of_phoneme_length += phoneme_length; + let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); - if i as i64 == vowel_indexes[vowel_indexes_index] { - for _ in 0..sum_of_phoneme_length { - f0.push(f0_list[count_of_f0]); + MoraModel::new( + mora_to_text(last_mora.vowel()), + None, + None, + last_mora.vowel().clone(), + FIX_VOWEL_LENGTH, + pitch, + ) + } + + fn to_wav(wave: &[f32], audio_query: &AudioQueryModel) -> Vec { + let volume_scale = *audio_query.volume_scale(); + let output_stereo = *audio_query.output_stereo(); + let output_sampling_rate = *audio_query.output_sampling_rate(); + + // TODO: 44.1kHzなどの対応 + + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let bit_depth: u16 = 16; + let repeat_count: u32 = + (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let block_size: u16 = bit_depth * num_channels / 8; + + let bytes_size = wave.len() as u32 * repeat_count * 2; + let wave_size = bytes_size + 44; + + let buf: Vec = Vec::with_capacity(wave_size as usize); + let mut cur = Cursor::new(buf); + + cur.write_all("RIFF".as_bytes()).unwrap(); + cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); + cur.write_all("WAVEfmt ".as_bytes()).unwrap(); + cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length + cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM + cur.write_all(&num_channels.to_le_bytes()).unwrap(); + cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); + + let block_rate = output_sampling_rate * block_size as u32; + + cur.write_all(&block_rate.to_le_bytes()).unwrap(); + cur.write_all(&block_size.to_le_bytes()).unwrap(); + cur.write_all(&bit_depth.to_le_bytes()).unwrap(); + cur.write_all("data".as_bytes()).unwrap(); + cur.write_all(&bytes_size.to_le_bytes()).unwrap(); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); } - count_of_f0 += 1; - sum_of_phoneme_length = 0; - vowel_indexes_index += 1; } + + cur.into_inner() } } - // 2次元のvectorを1次元に変換し、アドレスを連続させる - let flatten_phoneme = phoneme.into_iter().flatten().collect::>(); + /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 + /// + /// # Example + /// + #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 + #[cfg_attr(not(windows), doc = "```")] + /// # #[tokio::main] + /// # async fn main() -> anyhow::Result<()> { + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .await?; + /// # + /// use voicevox_core::StyleId; + /// + /// let accent_phrases = synthesizer + /// .create_accent_phrases_from_kana("コンニチワ'", StyleId::new(302)) + /// .await?; + /// # + /// # Ok(()) + /// # } + /// ``` + pub fn create_accent_phrases_from_kana( + &self, + 
kana: &str, + style_id: StyleId, + ) -> Result> { + self.replace_mora_data(&parse_kana(kana)?, style_id) + } - let wave = &self.decode( - f0.len(), - OjtPhoneme::num_phoneme(), - &f0, - &flatten_phoneme, - style_id, - )?; - return Ok(to_wav(wave, audio_query)); + /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 + pub fn replace_mora_data( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.replace_phoneme_length(accent_phrases, style_id)?; + self.replace_mora_pitch(&accent_phrases, style_id) + } - fn adjust_interrogative_accent_phrases( + /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 + pub fn replace_phoneme_length( + &self, accent_phrases: &[AccentPhraseModel], - ) -> Vec { - accent_phrases + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); + + let phoneme_list_s: Vec = phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let phoneme_length = self.predict_duration(&phoneme_list_s, style_id)?; + + let mut index = 0; + let new_accent_phrases = accent_phrases .iter() .map(|accent_phrase| { AccentPhraseModel::new( - adjust_interrogative_moras(accent_phrase), + accent_phrase + .moras() + .iter() + .map(|mora| { + let new_mora = MoraModel::new( + mora.text().clone(), + mora.consonant().clone(), + mora.consonant().as_ref().map(|_| { + phoneme_length[vowel_indexes_data[index + 1] as usize - 1] + }), + mora.vowel().clone(), + phoneme_length[vowel_indexes_data[index + 1] as usize], + *mora.pitch(), + ); + index += 1; + new_mora + }) + .collect(), *accent_phrase.accent(), - accent_phrase.pause_mora().clone(), + accent_phrase.pause_mora().as_ref().map(|pause_mora| { + let new_pause_mora = MoraModel::new( + pause_mora.text().clone(), + pause_mora.consonant().clone(), + *pause_mora.consonant_length(), + pause_mora.vowel().clone(), + phoneme_length[vowel_indexes_data[index + 1] as usize], + *pause_mora.pitch(), + ); + index += 1; + new_pause_mora + }), *accent_phrase.is_interrogative(), ) }) - .collect() + .collect(); + + Ok(new_accent_phrases) } - fn adjust_interrogative_moras(accent_phrase: &AccentPhraseModel) -> Vec { - let moras = accent_phrase.moras(); - if *accent_phrase.is_interrogative() && !moras.is_empty() { - let last_mora = moras.last().unwrap(); - let last_mora_pitch = *last_mora.pitch(); - if last_mora_pitch != 0.0 { - let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); - new_moras.extend_from_slice(moras.as_slice()); - let interrogative_mora = make_interrogative_mora(last_mora); - new_moras.push(interrogative_mora); - return new_moras; + /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 + pub fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let mut base_start_accent_list = vec![0]; + let mut base_end_accent_list = vec![0]; + let mut base_start_accent_phrase_list = vec![0]; + let mut base_end_accent_phrase_list = vec![0]; + for accent_phrase in accent_phrases { + let mut accent = usize::from(*accent_phrase.accent() != 1); + create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); + + accent = *accent_phrase.accent() - 1; + create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); + create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); + create_one_accent_list(&mut 
base_end_accent_phrase_list, accent_phrase, -1); + } + base_start_accent_list.push(0); + base_end_accent_list.push(0); + base_start_accent_phrase_list.push(0); + base_end_accent_phrase_list.push(0); + + let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = + split_mora(&phoneme_data_list); + + let consonant_phoneme_list: Vec = consonant_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let vowel_phoneme_list: Vec = vowel_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + + let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + + for vowel_index in vowel_indexes { + start_accent_list.push(base_start_accent_list[vowel_index as usize]); + end_accent_list.push(base_end_accent_list[vowel_index as usize]); + start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); + end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); + } + + let mut f0_list = self.predict_intonation( + vowel_phoneme_list.len(), + &vowel_phoneme_list, + &consonant_phoneme_list, + &start_accent_list, + &end_accent_list, + &start_accent_phrase_list, + &end_accent_phrase_list, + style_id, + )?; + + for i in 0..vowel_phoneme_data_list.len() { + const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; + + if UNVOICED_MORA_PHONEME_LIST + .iter() + .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) + { + f0_list[i] = 0.; } } - moras.clone() - } - fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel { - const FIX_VOWEL_LENGTH: f32 = 0.15; - const ADJUST_PITCH: f32 = 0.3; - const MAX_PITCH: f32 = 6.5; + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + accent_phrase + .moras() + .iter() + .map(|mora| { + let new_mora = MoraModel::new( + mora.text().clone(), + mora.consonant().clone(), + *mora.consonant_length(), + mora.vowel().clone(), + *mora.vowel_length(), + f0_list[index + 1], + ); + index += 1; + new_mora + }) + .collect(), + *accent_phrase.accent(), + accent_phrase.pause_mora().as_ref().map(|pause_mora| { + let new_pause_mora = MoraModel::new( + pause_mora.text().clone(), + pause_mora.consonant().clone(), + *pause_mora.consonant_length(), + pause_mora.vowel().clone(), + *pause_mora.vowel_length(), + f0_list[index + 1], + ); + index += 1; + new_pause_mora + }), + *accent_phrase.is_interrogative(), + ) + }) + .collect(); - let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); + return Ok(new_accent_phrases); - MoraModel::new( - mora_to_text(last_mora.vowel()), - None, - None, - last_mora.vowel().clone(), - FIX_VOWEL_LENGTH, - pitch, - ) - } + fn create_one_accent_list( + accent_list: &mut Vec, + accent_phrase: &AccentPhraseModel, + point: i32, + ) { + let mut one_accent_list: Vec = Vec::new(); - fn to_wav(wave: &[f32], audio_query: &AudioQueryModel) -> Vec { - let volume_scale = *audio_query.volume_scale(); - let output_stereo = *audio_query.output_stereo(); - let output_sampling_rate = *audio_query.output_sampling_rate(); - - // TODO: 44.1kHzなどの対応 - - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let bit_depth: u16 = 16; - let repeat_count: u32 = - (output_sampling_rate / 
DEFAULT_SAMPLING_RATE) * num_channels as u32; - let block_size: u16 = bit_depth * num_channels / 8; - - let bytes_size = wave.len() as u32 * repeat_count * 2; - let wave_size = bytes_size + 44; - - let buf: Vec = Vec::with_capacity(wave_size as usize); - let mut cur = Cursor::new(buf); - - cur.write_all("RIFF".as_bytes()).unwrap(); - cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); - cur.write_all("WAVEfmt ".as_bytes()).unwrap(); - cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length - cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM - cur.write_all(&num_channels.to_le_bytes()).unwrap(); - cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); - - let block_rate = output_sampling_rate * block_size as u32; - - cur.write_all(&block_rate.to_le_bytes()).unwrap(); - cur.write_all(&block_size.to_le_bytes()).unwrap(); - cur.write_all(&bit_depth.to_le_bytes()).unwrap(); - cur.write_all("data".as_bytes()).unwrap(); - cur.write_all(&bytes_size.to_le_bytes()).unwrap(); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); + for (i, mora) in accent_phrase.moras().iter().enumerate() { + let value = (i as i32 == point + || (point < 0 + && i == (accent_phrase.moras().len() as i32 + point) as usize)) + .into(); + one_accent_list.push(value); + if mora.consonant().is_some() { + one_accent_list.push(value); + } + } + if accent_phrase.pause_mora().is_some() { + one_accent_list.push(0); } + accent_list.extend(one_accent_list) } + } - cur.into_inner() + /// AquesTalk風記法から[AudioQuery]を生成する。 + /// + /// # Example + /// + #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 + #[cfg_attr(not(windows), doc = "```")] + /// # #[tokio::main] + /// # async fn main() -> anyhow::Result<()> { + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .await?; + /// # + /// use voicevox_core::StyleId; + /// + /// let audio_query = synthesizer + /// .audio_query_from_kana("コンニチワ'", StyleId::new(302)) + /// .await?; + /// # + /// # Ok(()) + /// # } + /// ``` + /// + /// [AudioQuery]: crate::AudioQueryModel + pub fn audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id)?; + Ok(AudioQueryModel::from_accent_phrases(accent_phrases) + .with_kana(Some(kana.to_owned()))) } - } - /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 - /// - /// # Example - /// - #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 - #[cfg_attr(not(windows), doc = "```")] - /// # #[tokio::main] - /// # async fn main() -> anyhow::Result<()> { - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .await?; - /// # - /// use voicevox_core::StyleId; - /// - /// let accent_phrases = synthesizer - /// .create_accent_phrases_from_kana("コンニチワ'", StyleId::new(302)) - /// .await?; - /// # - /// # Ok(()) - /// # } - /// ``` - pub fn create_accent_phrases_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result> { - self.replace_mora_data(&parse_kana(kana)?, style_id) + /// AquesTalk風記法から音声合成を行う。 + pub fn tts_from_kana( + &self, + kana: &str, + style_id: StyleId, + 
options: &TtsOptions, + ) -> Result> { + let audio_query = &self.audio_query_from_kana(kana, style_id)?; + self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + } } -} -impl self::blocking::Synthesizer { - /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 - /// - /// # Example - /// - #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 - #[cfg_attr(not(windows), doc = "```")] - /// # #[tokio::main] - /// # async fn main() -> anyhow::Result<()> { - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .await?; - /// # - /// use voicevox_core::StyleId; - /// - /// let accent_phrases = synthesizer - /// .create_accent_phrases("こんにちは", StyleId::new(302)) - /// .await?; - /// # - /// # Ok(()) - /// # } - /// ``` - pub fn create_accent_phrases( - &self, - text: &str, - style_id: StyleId, - ) -> Result> { - if text.is_empty() { - return Ok(Vec::new()); - } + impl self::Synthesizer { + /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 + /// + /// # Example + /// + #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 + #[cfg_attr(not(windows), doc = "```")] + /// # #[tokio::main] + /// # async fn main() -> anyhow::Result<()> { + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .await?; + /// # + /// use voicevox_core::StyleId; + /// + /// let accent_phrases = synthesizer + /// .create_accent_phrases("こんにちは", StyleId::new(302)) + /// .await?; + /// # + /// # Ok(()) + /// # } + /// ``` + pub fn create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> Result> { + if text.is_empty() { + return Ok(Vec::new()); + } - let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; + let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; - let accent_phrases: Vec = utterance - .breath_groups() - .iter() - .enumerate() - .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { - accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( - |(j, accent_phrase)| { - let moras = accent_phrase - .moras() - .iter() - .map(|mora| { - let mora_text = mora - .phonemes() - .iter() - .map(|phoneme| phoneme.phoneme().to_string()) - .collect::>() - .join(""); - - let (consonant, consonant_length) = - if let Some(consonant) = mora.consonant() { - (Some(consonant.phoneme().to_string()), Some(0.)) - } else { - (None, None) - }; - - MoraModel::new( - mora_to_text(mora_text), - consonant, - consonant_length, - mora.vowel().phoneme().into(), + let accent_phrases: Vec = utterance + .breath_groups() + .iter() + .enumerate() + .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { + accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( + |(j, accent_phrase)| { + let moras = accent_phrase + .moras() + .iter() + .map(|mora| { + let mora_text = mora + .phonemes() + .iter() + .map(|phoneme| phoneme.phoneme().to_string()) + .collect::>() + .join(""); + + let (consonant, consonant_length) = + if let Some(consonant) = mora.consonant() { + (Some(consonant.phoneme().to_string()), Some(0.)) + } else { + (None, None) + }; + + MoraModel::new( + mora_to_text(mora_text), + consonant, + consonant_length, + mora.vowel().phoneme().into(), + 0., + 0., + ) + }) + .collect(); + + let pause_mora = if i != utterance.breath_groups().len() 
- 1 + && j == breath_group.accent_phrases().len() - 1 + { + Some(MoraModel::new( + "、".into(), + None, + None, + "pau".into(), 0., 0., - ) - }) - .collect(); - - let pause_mora = if i != utterance.breath_groups().len() - 1 - && j == breath_group.accent_phrases().len() - 1 - { - Some(MoraModel::new( - "、".into(), - None, - None, - "pau".into(), - 0., - 0., - )) - } else { - None - }; - - AccentPhraseModel::new( - moras, - *accent_phrase.accent(), - pause_mora, - *accent_phrase.is_interrogative(), - ) - }, - )); + )) + } else { + None + }; + + AccentPhraseModel::new( + moras, + *accent_phrase.accent(), + pause_mora, + *accent_phrase.is_interrogative(), + ) + }, + )); + + accum_vec + }); + + self.replace_mora_data(&accent_phrases, style_id) + } - accum_vec - }); + /// 日本語のテキストから[AudioQuery]を生成する。 + /// + /// # Examples + /// + #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 + #[cfg_attr(not(windows), doc = "```")] + /// # #[tokio::main] + /// # async fn main() -> anyhow::Result<()> { + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .await?; + /// # + /// use voicevox_core::StyleId; + /// + /// let audio_query = synthesizer + /// .audio_query("こんにちは", StyleId::new(302)) + /// .await?; + /// # + /// # Ok(()) + /// # } + /// ``` + /// + /// [AudioQuery]: crate::AudioQueryModel + pub fn audio_query(&self, text: &str, style_id: StyleId) -> Result { + let accent_phrases = self.create_accent_phrases(text, style_id)?; + Ok(AudioQueryModel::from_accent_phrases(accent_phrases)) + } - self.replace_mora_data(&accent_phrases, style_id) + /// 日本語のテキストから音声合成を行う。 + pub fn tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { + let audio_query = &self.audio_query(text, style_id)?; + self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + } } -} -impl self::blocking::Synthesizer { - /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 - pub fn replace_mora_data( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let accent_phrases = self.replace_phoneme_length(accent_phrases, style_id)?; - self.replace_mora_pitch(&accent_phrases, style_id) + pub trait PerformInference { + /// `predict_duration`を実行する。 + /// + /// # Performance + /// + /// CPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 + fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result>; + + /// `predict_intonation`を実行する。 + /// + /// # Performance + /// + /// CPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 + #[allow(clippy::too_many_arguments)] + fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: &[i64], + consonant_phoneme_vector: &[i64], + start_accent_vector: &[i64], + end_accent_vector: &[i64], + start_accent_phrase_vector: &[i64], + end_accent_phrase_vector: &[i64], + style_id: StyleId, + ) -> Result>; + + /// `decode`を実行する。 + /// + /// # Performance + /// + /// CPU/GPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 + fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result>; } - /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 - pub fn replace_phoneme_length( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = initial_process(accent_phrases); + impl PerformInference for self::Synthesizer { + fn predict_duration(&self, phoneme_vector: &[i64], style_id: 
StyleId) -> Result> { + // FIXME: `Status::ids_for`があるため、ここは不要なはず + if !self.status.validate_speaker_id(style_id) { + return Err(ErrorRepr::StyleNotFound { style_id }.into()); + } - let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); + let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + + let PredictDurationOutput { + phoneme_length: output, + } = self.status.run_session( + &model_id, + PredictDurationInput { + phoneme_list: ndarray::arr1(phoneme_vector), + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; + let mut output = output.into_raw_vec(); + + for output_item in output.iter_mut() { + if *output_item < PHONEME_LENGTH_MINIMAL { + *output_item = PHONEME_LENGTH_MINIMAL; + } + } - let phoneme_list_s: Vec = phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let phoneme_length = self.predict_duration(&phoneme_list_s, style_id)?; + return Ok(output); - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - accent_phrase - .moras() - .iter() - .map(|mora| { - let new_mora = MoraModel::new( - mora.text().clone(), - mora.consonant().clone(), - mora.consonant().as_ref().map(|_| { - phoneme_length[vowel_indexes_data[index + 1] as usize - 1] - }), - mora.vowel().clone(), - phoneme_length[vowel_indexes_data[index + 1] as usize], - *mora.pitch(), - ); - index += 1; - new_mora - }) - .collect(), - *accent_phrase.accent(), - accent_phrase.pause_mora().as_ref().map(|pause_mora| { - let new_pause_mora = MoraModel::new( - pause_mora.text().clone(), - pause_mora.consonant().clone(), - *pause_mora.consonant_length(), - pause_mora.vowel().clone(), - phoneme_length[vowel_indexes_data[index + 1] as usize], - *pause_mora.pitch(), - ); - index += 1; - new_pause_mora - }), - *accent_phrase.is_interrogative(), - ) - }) - .collect(); + const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + } - Ok(new_accent_phrases) - } + fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: &[i64], + consonant_phoneme_vector: &[i64], + start_accent_vector: &[i64], + end_accent_vector: &[i64], + start_accent_phrase_vector: &[i64], + end_accent_phrase_vector: &[i64], + style_id: StyleId, + ) -> Result> { + // FIXME: `Status::ids_for`があるため、ここは不要なはず + if !self.status.validate_speaker_id(style_id) { + return Err(ErrorRepr::StyleNotFound { style_id }.into()); + } - /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 - pub fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = initial_process(accent_phrases); - - let mut base_start_accent_list = vec![0]; - let mut base_end_accent_list = vec![0]; - let mut base_start_accent_phrase_list = vec![0]; - let mut base_end_accent_phrase_list = vec![0]; - for accent_phrase in accent_phrases { - let mut accent = usize::from(*accent_phrase.accent() != 1); - create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); - - accent = *accent_phrase.accent() - 1; - create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); - create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); - create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); + let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + + let PredictIntonationOutput { f0_list: output } = self.status.run_session( + &model_id, + PredictIntonationInput { + length: ndarray::arr0(length as i64), + vowel_phoneme_list: 
ndarray::arr1(vowel_phoneme_vector), + consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector), + start_accent_list: ndarray::arr1(start_accent_vector), + end_accent_list: ndarray::arr1(end_accent_vector), + start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector), + end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector), + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; + + Ok(output.into_raw_vec()) } - base_start_accent_list.push(0); - base_end_accent_list.push(0); - base_start_accent_phrase_list.push(0); - base_end_accent_phrase_list.push(0); - let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = - split_mora(&phoneme_data_list); + fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result> { + // FIXME: `Status::ids_for`があるため、ここは不要なはず + if !self.status.validate_speaker_id(style_id) { + return Err(ErrorRepr::StyleNotFound { style_id }.into()); + } - let consonant_phoneme_list: Vec = consonant_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let vowel_phoneme_list: Vec = vowel_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); + let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + + // 音が途切れてしまうのを避けるworkaround処理が入っている + // TODO: 改善したらここのpadding処理を取り除く + const PADDING_SIZE: f64 = 0.4; + let padding_size = + ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize; + let start_and_end_padding_size = 2 * padding_size; + let length_with_padding = length + start_and_end_padding_size; + let f0_with_padding = make_f0_with_padding(f0, length_with_padding, padding_size); + + let phoneme_with_padding = make_phoneme_with_padding( + phoneme_vector, + phoneme_size, + length_with_padding, + padding_size, + ); - let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + let DecodeOutput { wave: output } = self.status.run_session( + &model_id, + DecodeInput { + f0: ndarray::arr1(&f0_with_padding) + .into_shape([length_with_padding, 1]) + .unwrap(), + phoneme: ndarray::arr1(&phoneme_with_padding) + .into_shape([length_with_padding, phoneme_size]) + .unwrap(), + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; + + return Ok(trim_padding_from_output( + output.into_raw_vec(), + padding_size, + )); + + fn make_f0_with_padding( + f0_slice: &[f32], + length_with_padding: usize, + padding_size: usize, + ) -> Vec { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let mut f0_with_padding = Vec::with_capacity(length_with_padding); + let padding = vec![0.0; padding_size]; + f0_with_padding.extend_from_slice(&padding); + f0_with_padding.extend_from_slice(f0_slice); + f0_with_padding.extend_from_slice(&padding); + f0_with_padding + } - for vowel_index in vowel_indexes { - start_accent_list.push(base_start_accent_list[vowel_index as usize]); - end_accent_list.push(base_end_accent_list[vowel_index as usize]); - start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); - end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); - } + fn make_phoneme_with_padding( + phoneme_slice: &[f32], + phoneme_size: usize, + length_with_padding: usize, + 
padding_size: usize, + ) -> Vec { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let mut padding_phoneme = vec![0.0; phoneme_size]; + padding_phoneme[0] = 1.0; + let padding_phoneme_len = padding_phoneme.len(); + let padding_phonemes: Vec = padding_phoneme + .into_iter() + .cycle() + .take(padding_phoneme_len * padding_size) + .collect(); + let mut phoneme_with_padding = + Vec::with_capacity(phoneme_size * length_with_padding); + phoneme_with_padding.extend_from_slice(&padding_phonemes); + phoneme_with_padding.extend_from_slice(phoneme_slice); + phoneme_with_padding.extend_from_slice(&padding_phonemes); + + phoneme_with_padding + } - let mut f0_list = self.predict_intonation( - vowel_phoneme_list.len(), - &vowel_phoneme_list, - &consonant_phoneme_list, - &start_accent_list, - &end_accent_list, - &start_accent_phrase_list, - &end_accent_phrase_list, - style_id, - )?; - - for i in 0..vowel_phoneme_data_list.len() { - const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; - - if UNVOICED_MORA_PHONEME_LIST - .iter() - .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) - { - f0_list[i] = 0.; + fn trim_padding_from_output(mut output: Vec, padding_f0_size: usize) -> Vec { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let padding_sampling_size = padding_f0_size * 256; + output + .drain(padding_sampling_size..output.len() - padding_sampling_size) + .collect() } } + } - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - accent_phrase - .moras() - .iter() - .map(|mora| { - let new_mora = MoraModel::new( - mora.text().clone(), - mora.consonant().clone(), - *mora.consonant_length(), - mora.vowel().clone(), - *mora.vowel_length(), - f0_list[index + 1], - ); - index += 1; - new_mora - }) - .collect(), - *accent_phrase.accent(), - accent_phrase.pause_mora().as_ref().map(|pause_mora| { - let new_pause_mora = MoraModel::new( - pause_mora.text().clone(), - pause_mora.consonant().clone(), - *pause_mora.consonant_length(), - pause_mora.vowel().clone(), - *pause_mora.vowel_length(), - f0_list[index + 1], - ); - index += 1; - new_pause_mora - }), - *accent_phrase.is_interrogative(), - ) - }) - .collect(); + #[cfg(windows)] + fn list_windows_video_cards() { + use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; - return Ok(new_accent_phrases); - - fn create_one_accent_list( - accent_list: &mut Vec, - accent_phrase: &AccentPhraseModel, - point: i32, - ) { - let mut one_accent_list: Vec = Vec::new(); - - for (i, mora) in accent_phrase.moras().iter().enumerate() { - let value = (i as i32 == point - || (point < 0 && i == (accent_phrase.moras().len() as i32 + point) as usize)) - .into(); - one_accent_list.push(value); - if mora.consonant().is_some() { - one_accent_list.push(value); + use humansize::BINARY; + use tracing::{error, info}; + use windows::Win32::Graphics::Dxgi::{ + CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND, + }; + + info!("検出されたGPU (DirectMLには1番目のGPUが使われます):"); + match list_windows_video_cards() { + Ok(descs) => { + for desc in descs { + let description = OsString::from_wide(trim_nul(&desc.Description)); + let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY); + info!(" - {description:?} ({vram})"); } } - if accent_phrase.pause_mora().is_some() { - one_accent_list.push(0); + Err(err) => error!("{err}"), + } + + fn list_windows_video_cards() -> windows::core::Result> { + #[allow(unsafe_code)] + unsafe { + let factory = 
CreateDXGIFactory::()?; + (0..) + .map(|i| factory.EnumAdapters(i)?.GetDesc()) + .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND)) + .collect() } - accent_list.extend(one_accent_list) } - } - /// AquesTalk風記法から[AudioQuery]を生成する。 - /// - /// # Example - /// - #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 - #[cfg_attr(not(windows), doc = "```")] - /// # #[tokio::main] - /// # async fn main() -> anyhow::Result<()> { - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .await?; - /// # - /// use voicevox_core::StyleId; - /// - /// let audio_query = synthesizer - /// .audio_query_from_kana("コンニチワ'", StyleId::new(302)) - /// .await?; - /// # - /// # Ok(()) - /// # } - /// ``` - /// - /// [AudioQuery]: crate::AudioQueryModel - pub fn audio_query_from_kana(&self, kana: &str, style_id: StyleId) -> Result { - let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id)?; - Ok(AudioQueryModel::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) + fn trim_nul(s: &[u16]) -> &[u16] { + &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())] + } } -} -impl self::blocking::Synthesizer { - /// 日本語のテキストから[AudioQuery]を生成する。 - /// - /// # Examples - /// - #[cfg_attr(windows, doc = "```no_run")] // https://github.com/VOICEVOX/voicevox_core/issues/537 - #[cfg_attr(not(windows), doc = "```")] - /// # #[tokio::main] - /// # async fn main() -> anyhow::Result<()> { - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .await?; - /// # - /// use voicevox_core::StyleId; - /// - /// let audio_query = synthesizer - /// .audio_query("こんにちは", StyleId::new(302)) - /// .await?; - /// # - /// # Ok(()) - /// # } - /// ``` - /// - /// [AudioQuery]: crate::AudioQueryModel - pub fn audio_query(&self, text: &str, style_id: StyleId) -> Result { - let accent_phrases = self.create_accent_phrases(text, style_id)?; - Ok(AudioQueryModel::from_accent_phrases(accent_phrases)) - } -} + fn initial_process(accent_phrases: &[AccentPhraseModel]) -> (Vec, Vec) { + let flatten_moras = to_flatten_moras(accent_phrases); -impl self::blocking::Synthesizer { - /// AquesTalk風記法から音声合成を行う。 - pub fn tts_from_kana( - &self, - kana: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let audio_query = &self.audio_query_from_kana(kana, style_id)?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) - } -} + let mut phoneme_strings = vec!["pau".to_string()]; + for mora in flatten_moras.iter() { + if let Some(consonant) = mora.consonant() { + phoneme_strings.push(consonant.clone()) + } + phoneme_strings.push(mora.vowel().clone()); + } + phoneme_strings.push("pau".to_string()); -impl self::blocking::Synthesizer { - /// 日本語のテキストから音声合成を行う。 - pub fn tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { - let audio_query = &self.audio_query(text, style_id)?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) - } -} + let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); -pub trait PerformInference { - /// `predict_duration`を実行する。 - /// - /// # Performance - /// - /// CPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 - fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result>; - - /// 
`predict_intonation`を実行する。 - /// - /// # Performance - /// - /// CPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 - #[allow(clippy::too_many_arguments)] - fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], - style_id: StyleId, - ) -> Result>; - - /// `decode`を実行する。 - /// - /// # Performance - /// - /// CPU/GPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 - fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result>; -} + return (flatten_moras, phoneme_data_list); + + fn to_flatten_moras(accent_phrases: &[AccentPhraseModel]) -> Vec { + let mut flatten_moras = Vec::new(); + + for accent_phrase in accent_phrases { + let moras = accent_phrase.moras(); + for mora in moras { + flatten_moras.push(mora.clone()); + } + if let Some(pause_mora) = accent_phrase.pause_mora() { + flatten_moras.push(pause_mora.clone()); + } + } -impl PerformInference for self::blocking::Synthesizer { - fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result> { - // FIXME: `Status::ids_for`があるため、ここは不要なはず - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); + flatten_moras } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { + OjtPhoneme::convert( + phoneme_str_list + .iter() + .enumerate() + .map(|(i, s)| OjtPhoneme::new(s.as_ref().to_string(), i as f32, i as f32 + 1.)) + .collect::>() + .as_slice(), + ) + } + } - let PredictDurationOutput { - phoneme_length: output, - } = self.status.run_session( - &model_id, - PredictDurationInput { - phoneme_list: ndarray::arr1(phoneme_vector), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - )?; - let mut output = output.into_raw_vec(); + fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { + let mut vowel_indexes = Vec::new(); + for (i, phoneme) in phoneme_list.iter().enumerate() { + const MORA_PHONEME_LIST: &[&str] = &[ + "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", + ]; - for output_item in output.iter_mut() { - if *output_item < PHONEME_LENGTH_MINIMAL { - *output_item = PHONEME_LENGTH_MINIMAL; + if MORA_PHONEME_LIST + .iter() + .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) + { + vowel_indexes.push(i as i64); } } - return Ok(output); + let vowel_phoneme_list = vowel_indexes + .iter() + .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) + .collect(); + + let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; + for i in 0..(vowel_indexes.len() - 1) { + let prev = vowel_indexes[i]; + let next = vowel_indexes[i + 1]; + if next - prev == 1 { + consonant_phoneme_list.push(OjtPhoneme::default()); + } else { + consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); + } + } - const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) } - fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], - style_id: StyleId, - ) -> Result> { - // FIXME: `Status::ids_for`があるため、ここは不要なはず - if !self.status.validate_speaker_id(style_id) { - return 
Err(ErrorRepr::StyleNotFound { style_id }.into()); + fn mora_to_text(mora: impl AsRef) -> String { + let last_char = mora.as_ref().chars().last().unwrap(); + let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { + format!( + "{}{}", + &mora.as_ref()[0..mora.as_ref().len() - 1], + last_char.to_lowercase() + ) + } else { + mora.as_ref().to_string() + }; + // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる + engine::mora2text(&mora).to_string() + } + + impl AudioQueryModel { + fn from_accent_phrases(accent_phrases: Vec) -> Self { + let kana = create_kana(&accent_phrases); + Self::new( + accent_phrases, + 1., + 0., + 1., + 1., + 0.1, + 0.1, + DEFAULT_SAMPLING_RATE, + false, + Some(kana), + ) } + } +} - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; - - let PredictIntonationOutput { f0_list: output } = self.status.run_session( - &model_id, - PredictIntonationInput { - length: ndarray::arr0(length as i64), - vowel_phoneme_list: ndarray::arr1(vowel_phoneme_vector), - consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector), - start_accent_list: ndarray::arr1(start_accent_vector), - end_accent_list: ndarray::arr1(end_accent_vector), - start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector), - end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - )?; +pub(crate) mod tokio { + use std::sync::Arc; - Ok(output.into_raw_vec()) - } + use crate::{ + AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, + SynthesisOptions, VoiceModelId, VoiceModelMeta, + }; - fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result> { - // FIXME: `Status::ids_for`があるため、ここは不要なはず - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } + use super::{InitializeOptions, TtsOptions}; - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; - - // 音が途切れてしまうのを避けるworkaround処理が入っている - // TODO: 改善したらここのpadding処理を取り除く - const PADDING_SIZE: f64 = 0.4; - let padding_size = ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize; - let start_and_end_padding_size = 2 * padding_size; - let length_with_padding = length + start_and_end_padding_size; - let f0_with_padding = make_f0_with_padding(f0, length_with_padding, padding_size); - - let phoneme_with_padding = make_phoneme_with_padding( - phoneme_vector, - phoneme_size, - length_with_padding, - padding_size, - ); + /// 音声シンセサイザ。 + #[derive(Clone)] + pub struct Synthesizer(pub(super) Arc>); - let DecodeOutput { wave: output } = self.status.run_session( - &model_id, - DecodeInput { - f0: ndarray::arr1(&f0_with_padding) - .into_shape([length_with_padding, 1]) - .unwrap(), - phoneme: ndarray::arr1(&phoneme_with_padding) - .into_shape([length_with_padding, phoneme_size]) - .unwrap(), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - )?; - - return Ok(trim_padding_from_output( - output.into_raw_vec(), - padding_size, - )); - - fn make_f0_with_padding( - f0_slice: &[f32], - length_with_padding: usize, - padding_size: usize, - ) -> Vec { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut f0_with_padding = Vec::with_capacity(length_with_padding); - let padding = vec![0.0; padding_size]; - f0_with_padding.extend_from_slice(&padding); - f0_with_padding.extend_from_slice(f0_slice); - f0_with_padding.extend_from_slice(&padding); - f0_with_padding + // FIXME: docを書く + impl 
self::Synthesizer { + pub fn new(open_jtalk: O, options: &InitializeOptions) -> Result { + super::blocking::Synthesizer::new(open_jtalk, options) + .map(Into::into) + .map(Self) } - fn make_phoneme_with_padding( - phoneme_slice: &[f32], - phoneme_size: usize, - length_with_padding: usize, - padding_size: usize, - ) -> Vec { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut padding_phoneme = vec![0.0; phoneme_size]; - padding_phoneme[0] = 1.0; - let padding_phoneme_len = padding_phoneme.len(); - let padding_phonemes: Vec = padding_phoneme - .into_iter() - .cycle() - .take(padding_phoneme_len * padding_size) - .collect(); - let mut phoneme_with_padding = Vec::with_capacity(phoneme_size * length_with_padding); - phoneme_with_padding.extend_from_slice(&padding_phonemes); - phoneme_with_padding.extend_from_slice(phoneme_slice); - phoneme_with_padding.extend_from_slice(&padding_phonemes); - - phoneme_with_padding + pub fn is_gpu_mode(&self) -> bool { + self.0.is_gpu_mode() } - fn trim_padding_from_output(mut output: Vec, padding_f0_size: usize) -> Vec { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let padding_sampling_size = padding_f0_size * 256; - output - .drain(padding_sampling_size..output.len() - padding_sampling_size) - .collect() + pub async fn load_voice_model(&self, model: &crate::tokio::VoiceModel) -> Result<()> { + let model_bytes = &model.read_inference_models().await?; + self.0.status.insert_model(model.header(), model_bytes) } - } -} -#[cfg(windows)] -fn list_windows_video_cards() { - use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; + pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { + self.0.unload_voice_model(voice_model_id) + } - use humansize::BINARY; - use tracing::{error, info}; - use windows::Win32::Graphics::Dxgi::{ - CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND, - }; + pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool { + self.0.is_loaded_voice_model(voice_model_id) + } - info!("検出されたGPU (DirectMLには1番目のGPUが使われます):"); - match list_windows_video_cards() { - Ok(descs) => { - for desc in descs { - let description = OsString::from_wide(trim_nul(&desc.Description)); - let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY); - info!(" - {description:?} ({vram})"); - } + #[doc(hidden)] + pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { + self.0.is_loaded_model_by_style_id(style_id) } - Err(err) => error!("{err}"), - } - fn list_windows_video_cards() -> windows::core::Result> { - #[allow(unsafe_code)] - unsafe { - let factory = CreateDXGIFactory::()?; - (0..) 
- .map(|i| factory.EnumAdapters(i)?.GetDesc()) - .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND)) - .collect() + pub fn metas(&self) -> VoiceModelMeta { + self.0.metas() } - } - fn trim_nul(s: &[u16]) -> &[u16] { - &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())] - } -} + pub async fn synthesis( + &self, + audio_query: &AudioQueryModel, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { + let blocking = self.0.clone(); + let audio_query = audio_query.clone(); + let options = options.clone(); + + crate::task::asyncify(move || blocking.synthesis(&audio_query, style_id, &options)) + .await + } -fn initial_process(accent_phrases: &[AccentPhraseModel]) -> (Vec, Vec) { - let flatten_moras = to_flatten_moras(accent_phrases); + pub async fn create_accent_phrases_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result> { + let blocking = self.0.clone(); + let kana = kana.to_owned(); - let mut phoneme_strings = vec!["pau".to_string()]; - for mora in flatten_moras.iter() { - if let Some(consonant) = mora.consonant() { - phoneme_strings.push(consonant.clone()) + crate::task::asyncify(move || blocking.create_accent_phrases_from_kana(&kana, style_id)) + .await } - phoneme_strings.push(mora.vowel().clone()); - } - phoneme_strings.push("pau".to_string()); - let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); + pub async fn replace_mora_data( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let blocking = self.0.clone(); + let accent_phrases = accent_phrases.to_owned(); - return (flatten_moras, phoneme_data_list); + crate::task::asyncify(move || blocking.replace_mora_data(&accent_phrases, style_id)) + .await + } - fn to_flatten_moras(accent_phrases: &[AccentPhraseModel]) -> Vec { - let mut flatten_moras = Vec::new(); + pub async fn replace_phoneme_length( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let blocking = self.0.clone(); + let accent_phrases = accent_phrases.to_owned(); - for accent_phrase in accent_phrases { - let moras = accent_phrase.moras(); - for mora in moras { - flatten_moras.push(mora.clone()); - } - if let Some(pause_mora) = accent_phrase.pause_mora() { - flatten_moras.push(pause_mora.clone()); - } + crate::task::asyncify(move || { + blocking.replace_phoneme_length(&accent_phrases, style_id) + }) + .await } - flatten_moras - } + pub async fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let blocking = self.0.clone(); + let accent_phrases = accent_phrases.to_owned(); - fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { - OjtPhoneme::convert( - phoneme_str_list - .iter() - .enumerate() - .map(|(i, s)| OjtPhoneme::new(s.as_ref().to_string(), i as f32, i as f32 + 1.)) - .collect::>() - .as_slice(), - ) - } -} + crate::task::asyncify(move || blocking.replace_mora_pitch(&accent_phrases, style_id)) + .await + } -fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { - let mut vowel_indexes = Vec::new(); - for (i, phoneme) in phoneme_list.iter().enumerate() { - const MORA_PHONEME_LIST: &[&str] = &[ - "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", - ]; + pub async fn audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let blocking = self.0.clone(); + let kana = kana.to_owned(); - if MORA_PHONEME_LIST - .iter() - .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) - { - vowel_indexes.push(i as i64); + 
crate::task::asyncify(move || blocking.audio_query_from_kana(&kana, style_id)).await } - } - let vowel_phoneme_list = vowel_indexes - .iter() - .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) - .collect(); - - let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; - for i in 0..(vowel_indexes.len() - 1) { - let prev = vowel_indexes[i]; - let next = vowel_indexes[i + 1]; - if next - prev == 1 { - consonant_phoneme_list.push(OjtPhoneme::default()); - } else { - consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); + pub async fn tts_from_kana( + &self, + kana: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> Result> { + let blocking = self.0.clone(); + let kana = kana.to_owned(); + let options = options.clone(); + + crate::task::asyncify(move || blocking.tts_from_kana(&kana, style_id, &options)).await } } - (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) -} + impl self::Synthesizer { + pub async fn create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> Result> { + let blocking = self.0.clone(); + let text = text.to_owned(); -fn mora_to_text(mora: impl AsRef) -> String { - let last_char = mora.as_ref().chars().last().unwrap(); - let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { - format!( - "{}{}", - &mora.as_ref()[0..mora.as_ref().len() - 1], - last_char.to_lowercase() - ) - } else { - mora.as_ref().to_string() - }; - // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる - engine::mora2text(&mora).to_string() -} - -impl AudioQueryModel { - fn from_accent_phrases(accent_phrases: Vec) -> Self { - let kana = create_kana(&accent_phrases); - Self::new( - accent_phrases, - 1., - 0., - 1., - 1., - 0.1, - 0.1, - DEFAULT_SAMPLING_RATE, - false, - Some(kana), - ) - } -} + crate::task::asyncify(move || blocking.create_accent_phrases(&text, style_id)).await + } -pub(crate) mod blocking { - use crate::infer::{domain::InferenceDomainImpl, status::Status}; + pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result { + let blocking = self.0.clone(); + let text = text.to_owned(); - use super::InferenceRuntimeImpl; + crate::task::asyncify(move || blocking.audio_query(&text, style_id)).await + } - /// 音声シンセサイザ。 - pub struct Synthesizer { - pub(super) status: Status, - pub(super) open_jtalk: O, - pub(super) use_gpu: bool, + pub async fn tts( + &self, + text: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> Result> { + let blocking = self.0.clone(); + let text = text.to_owned(); + let options = options.clone(); + + crate::task::asyncify(move || blocking.tts(&text, style_id, &options)).await + } } } -pub(crate) mod tokio { - use std::sync::Arc; - - /// 音声シンセサイザ。 - #[derive(Clone)] - pub struct Synthesizer(pub(super) Arc>); -} - #[cfg(test)] mod tests { - use super::{AccelerationMode, InitializeOptions, PerformInference as _}; + use super::{blocking::PerformInference as _, AccelerationMode, InitializeOptions}; use crate::{ engine::MoraModel, macros::tests::assert_debug_fmt_eq, test_util::open_default_vvm_file, AccentPhraseModel, Result, StyleId, diff --git a/crates/voicevox_core/src/user_dict/dict.rs b/crates/voicevox_core/src/user_dict/dict.rs index 5743b678f..6997620f0 100644 --- a/crates/voicevox_core/src/user_dict/dict.rs +++ b/crates/voicevox_core/src/user_dict/dict.rs @@ -1,174 +1,178 @@ -use indexmap::IndexMap; -use itertools::join; -use uuid::Uuid; +pub(crate) mod blocking { + use indexmap::IndexMap; + use itertools::join; + use uuid::Uuid; -use crate::{error::ErrorRepr, Result, UserDictWord}; + use 
crate::{error::ErrorRepr, Result}; -impl self::blocking::UserDict { - /// ユーザー辞書を作成する。 - pub fn new() -> Self { - Default::default() - } + use super::super::word::UserDictWord; - pub fn to_json(&self) -> String { - serde_json::to_string(&*self.words.lock().unwrap()).expect("should not fail") + /// ユーザー辞書。 + /// + /// 単語はJSONとの相互変換のために挿入された順序を保つ。 + #[derive(Debug, Default)] + pub struct UserDict { + words: std::sync::Mutex>, } - pub fn with_words(&self, f: impl FnOnce(&IndexMap) -> R) -> R { - f(&self.words.lock().unwrap()) - } + impl self::UserDict { + /// ユーザー辞書を作成する。 + pub fn new() -> Self { + Default::default() + } - /// ユーザー辞書をファイルから読み込む。 - /// - /// # Errors - /// - /// ファイルが読めなかった、または内容が不正だった場合はエラーを返す。 - pub fn load(&self, store_path: &str) -> Result<()> { - let words = (|| { - let words = &fs_err::read(store_path)?; - let words = serde_json::from_slice::>(words)?; - Ok(words) - })() - .map_err(ErrorRepr::LoadUserDict)?; - - self.words.lock().unwrap().extend(words); - Ok(()) - } + pub fn to_json(&self) -> String { + serde_json::to_string(&*self.words.lock().unwrap()).expect("should not fail") + } - /// ユーザー辞書に単語を追加する。 - pub fn add_word(&self, word: UserDictWord) -> Result { - let word_uuid = Uuid::new_v4(); - self.words.lock().unwrap().insert(word_uuid, word); - Ok(word_uuid) - } + pub fn with_words(&self, f: impl FnOnce(&IndexMap) -> R) -> R { + f(&self.words.lock().unwrap()) + } - /// ユーザー辞書の単語を変更する。 - pub fn update_word(&self, word_uuid: Uuid, new_word: UserDictWord) -> Result<()> { - let mut words = self.words.lock().unwrap(); - if !words.contains_key(&word_uuid) { - return Err(ErrorRepr::WordNotFound(word_uuid).into()); + /// ユーザー辞書をファイルから読み込む。 + /// + /// # Errors + /// + /// ファイルが読めなかった、または内容が不正だった場合はエラーを返す。 + pub fn load(&self, store_path: &str) -> Result<()> { + let words = (|| { + let words = &fs_err::read(store_path)?; + let words = serde_json::from_slice::>(words)?; + Ok(words) + })() + .map_err(ErrorRepr::LoadUserDict)?; + + self.words.lock().unwrap().extend(words); + Ok(()) } - words.insert(word_uuid, new_word); - Ok(()) - } - /// ユーザー辞書から単語を削除する。 - pub fn remove_word(&self, word_uuid: Uuid) -> Result { - let Some(word) = self.words.lock().unwrap().remove(&word_uuid) else { - return Err(ErrorRepr::WordNotFound(word_uuid).into()); - }; - Ok(word) - } + /// ユーザー辞書に単語を追加する。 + pub fn add_word(&self, word: UserDictWord) -> Result { + let word_uuid = Uuid::new_v4(); + self.words.lock().unwrap().insert(word_uuid, word); + Ok(word_uuid) + } - /// 他のユーザー辞書をインポートする。 - pub fn import(&self, other: &Self) -> Result<()> { - for (word_uuid, word) in &*other.words.lock().unwrap() { - self.words.lock().unwrap().insert(*word_uuid, word.clone()); + /// ユーザー辞書の単語を変更する。 + pub fn update_word(&self, word_uuid: Uuid, new_word: UserDictWord) -> Result<()> { + let mut words = self.words.lock().unwrap(); + if !words.contains_key(&word_uuid) { + return Err(ErrorRepr::WordNotFound(word_uuid).into()); + } + words.insert(word_uuid, new_word); + Ok(()) } - Ok(()) - } - /// ユーザー辞書を保存する。 - pub fn save(&self, store_path: &str) -> Result<()> { - fs_err::write( - store_path, - serde_json::to_vec(&self.words).expect("should not fail"), - ) - .map_err(|e| ErrorRepr::SaveUserDict(e.into()).into()) - } + /// ユーザー辞書から単語を削除する。 + pub fn remove_word(&self, word_uuid: Uuid) -> Result { + let Some(word) = self.words.lock().unwrap().remove(&word_uuid) else { + return Err(ErrorRepr::WordNotFound(word_uuid).into()); + }; + Ok(word) + } + + /// 他のユーザー辞書をインポートする。 + pub fn import(&self, other: &Self) -> Result<()> { + for 
(word_uuid, word) in &*other.words.lock().unwrap() { + self.words.lock().unwrap().insert(*word_uuid, word.clone()); + } + Ok(()) + } + + /// ユーザー辞書を保存する。 + pub fn save(&self, store_path: &str) -> Result<()> { + fs_err::write( + store_path, + serde_json::to_vec(&self.words).expect("should not fail"), + ) + .map_err(|e| ErrorRepr::SaveUserDict(e.into()).into()) + } - /// MeCabで使用する形式に変換する。 - pub(crate) fn to_mecab_format(&self) -> String { - join( - self.words - .lock() - .unwrap() - .values() - .map(UserDictWord::to_mecab_format), - "\n", - ) + /// MeCabで使用する形式に変換する。 + pub(crate) fn to_mecab_format(&self) -> String { + join( + self.words + .lock() + .unwrap() + .values() + .map(UserDictWord::to_mecab_format), + "\n", + ) + } } } -impl self::tokio::UserDict { - /// ユーザー辞書を作成する。 - pub fn new() -> Self { - Self(self::blocking::UserDict::new().into()) - } +pub(crate) mod tokio { + use std::sync::Arc; - pub fn to_json(&self) -> String { - self.0.to_json() - } + use indexmap::IndexMap; + use uuid::Uuid; - pub fn with_words(&self, f: impl FnOnce(&IndexMap) -> R) -> R { - self.0.with_words(f) - } + use crate::Result; - /// ユーザー辞書をファイルから読み込む。 - /// - /// # Errors - /// - /// ファイルが読めなかった、または内容が不正だった場合はエラーを返す。 - pub async fn load(&self, store_path: &str) -> Result<()> { - let blocking = self.0.clone(); - let store_path = store_path.to_owned(); - crate::task::asyncify(move || blocking.load(&store_path)).await - } + use super::super::word::UserDictWord; - /// ユーザー辞書に単語を追加する。 - pub fn add_word(&self, word: UserDictWord) -> Result { - self.0.add_word(word) - } + /// ユーザー辞書。 + /// + /// 単語はJSONとの相互変換のために挿入された順序を保つ。 + #[derive(Debug, Default)] + pub struct UserDict(Arc); - /// ユーザー辞書の単語を変更する。 - pub fn update_word(&self, word_uuid: Uuid, new_word: UserDictWord) -> Result<()> { - self.0.update_word(word_uuid, new_word) - } + impl self::UserDict { + /// ユーザー辞書を作成する。 + pub fn new() -> Self { + Self(super::blocking::UserDict::new().into()) + } - /// ユーザー辞書から単語を削除する。 - pub fn remove_word(&self, word_uuid: Uuid) -> Result { - self.0.remove_word(word_uuid) - } + pub fn to_json(&self) -> String { + self.0.to_json() + } - /// 他のユーザー辞書をインポートする。 - pub fn import(&self, other: &Self) -> Result<()> { - self.0.import(&other.0) - } + pub fn with_words(&self, f: impl FnOnce(&IndexMap) -> R) -> R { + self.0.with_words(f) + } - /// ユーザー辞書を保存する。 - pub async fn save(&self, store_path: &str) -> Result<()> { - let blocking = self.0.clone(); - let store_path = store_path.to_owned(); - crate::task::asyncify(move || blocking.save(&store_path)).await - } + /// ユーザー辞書をファイルから読み込む。 + /// + /// # Errors + /// + /// ファイルが読めなかった、または内容が不正だった場合はエラーを返す。 + pub async fn load(&self, store_path: &str) -> Result<()> { + let blocking = self.0.clone(); + let store_path = store_path.to_owned(); + crate::task::asyncify(move || blocking.load(&store_path)).await + } - /// MeCabで使用する形式に変換する。 - pub(crate) fn to_mecab_format(&self) -> String { - self.0.to_mecab_format() - } -} + /// ユーザー辞書に単語を追加する。 + pub fn add_word(&self, word: UserDictWord) -> Result { + self.0.add_word(word) + } -pub(crate) mod blocking { - use indexmap::IndexMap; - use uuid::Uuid; + /// ユーザー辞書の単語を変更する。 + pub fn update_word(&self, word_uuid: Uuid, new_word: UserDictWord) -> Result<()> { + self.0.update_word(word_uuid, new_word) + } - use super::UserDictWord; + /// ユーザー辞書から単語を削除する。 + pub fn remove_word(&self, word_uuid: Uuid) -> Result { + self.0.remove_word(word_uuid) + } - /// ユーザー辞書。 - /// - /// 単語はJSONとの相互変換のために挿入された順序を保つ。 - #[derive(Debug, Default)] - pub struct UserDict { - 
pub(super) words: std::sync::Mutex>, - } -} + /// 他のユーザー辞書をインポートする。 + pub fn import(&self, other: &Self) -> Result<()> { + self.0.import(&other.0) + } -pub(crate) mod tokio { - use std::sync::Arc; + /// ユーザー辞書を保存する。 + pub async fn save(&self, store_path: &str) -> Result<()> { + let blocking = self.0.clone(); + let store_path = store_path.to_owned(); + crate::task::asyncify(move || blocking.save(&store_path)).await + } - /// ユーザー辞書。 - /// - /// 単語はJSONとの相互変換のために挿入された順序を保つ。 - #[derive(Debug, Default)] - pub struct UserDict(pub(super) Arc); + /// MeCabで使用する形式に変換する。 + pub(crate) fn to_mecab_format(&self) -> String { + self.0.to_mecab_format() + } + } } diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs index e0a080bfc..96bf481d1 100644 --- a/crates/voicevox_core/src/voice_model.rs +++ b/crates/voicevox_core/src/voice_model.rs @@ -1,23 +1,12 @@ use derive_getters::Getters; use derive_new::new; -use enum_map::EnumMap; -use futures::future::join3; -use nanoid::nanoid; -use ouroboros::self_referencing; -use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; -use serde::{de::DeserializeOwned, Deserialize}; +use serde::Deserialize; use crate::{ - error::{LoadModelError, LoadModelErrorKind, LoadModelResult}, - infer::domain::InferenceOperationImpl, manifest::{Manifest, ModelInnerId}, - Result, SpeakerMeta, StyleId, StyleMeta, VoiceModelMeta, -}; -use std::{ - collections::{BTreeMap, HashMap}, - io::{self, Cursor}, - path::{Path, PathBuf}, + SpeakerMeta, StyleId, StyleMeta, VoiceModelMeta, }; +use std::{collections::BTreeMap, path::PathBuf}; /// [`VoiceModelId`]の実体。 /// @@ -32,234 +21,6 @@ pub struct VoiceModelId { raw_voice_model_id: RawVoiceModelId, } -impl self::blocking::VoiceModel { - pub(crate) fn read_inference_models( - &self, - ) -> LoadModelResult>> { - let reader = BlockingVvmEntryReader::open(&self.header.path)?; - - let model_bytes = [ - self.header.manifest.predict_duration_filename(), - self.header.manifest.predict_intonation_filename(), - self.header.manifest.decode_filename(), - ] - .into_par_iter() - .map(|filename| reader.read_vvm_entry(filename)) - .collect::, _>>()? 
- .try_into() - .unwrap_or_else(|_| panic!("should be same length")); - - Ok(EnumMap::from_array(model_bytes)) - } - - /// VVMファイルから`VoiceModel`をコンストラクトする。 - pub fn from_path(path: impl AsRef) -> crate::Result { - let path = path.as_ref().to_owned(); - let reader = BlockingVvmEntryReader::open(&path)?; - let manifest = reader.read_vvm_json::("manifest.json")?; - let metas = reader.read_vvm_json(manifest.metas_filename())?; - let id = VoiceModelId::new(nanoid!()); - - Ok(Self { - header: VoiceModelHeader { - id, - metas, - manifest, - path, - }, - }) - } - - /// ID。 - pub fn id(&self) -> &VoiceModelId { - &self.header.id - } - - /// メタ情報。 - pub fn metas(&self) -> &VoiceModelMeta { - &self.header.metas - } - - pub(crate) fn header(&self) -> &VoiceModelHeader { - &self.header - } -} - -#[self_referencing] -struct BlockingVvmEntryReader { - path: PathBuf, - zip: Vec, - #[covariant] - #[borrows(zip)] - reader: zip::ZipArchive>, -} - -impl BlockingVvmEntryReader { - fn open(path: &Path) -> LoadModelResult { - (|| { - let zip = std::fs::read(path)?; - Self::try_new(path.to_owned(), zip, |zip| { - zip::ZipArchive::new(Cursor::new(zip)) - }) - })() - .map_err(|source| LoadModelError { - path: path.to_owned(), - context: LoadModelErrorKind::OpenZipFile, - source: Some(source.into()), - }) - } - - fn read_vvm_json(&self, filename: &str) -> LoadModelResult { - let bytes = &self.read_vvm_entry(filename)?; - serde_json::from_slice(bytes).map_err(|source| LoadModelError { - path: self.borrow_path().clone(), - context: LoadModelErrorKind::OpenZipFile, - source: Some(source.into()), - }) - } - - fn read_vvm_entry(&self, filename: &str) -> LoadModelResult> { - (|| { - let mut reader = self.borrow_reader().clone(); - let mut entry = reader.by_name(filename)?; - let mut buf = Vec::with_capacity(entry.size() as _); - io::copy(&mut entry, &mut buf)?; - Ok(buf) - })() - .map_err(|source| LoadModelError { - path: self.borrow_path().clone(), - context: LoadModelErrorKind::OpenZipFile, - source: Some(source), - }) - } -} - -impl self::tokio::VoiceModel { - pub(crate) async fn read_inference_models( - &self, - ) -> LoadModelResult>> { - let reader = AsyncVvmEntryReader::open(&self.header.path).await?; - let (decode_model_result, predict_duration_model_result, predict_intonation_model_result) = - join3( - reader.read_vvm_entry(self.header.manifest.decode_filename()), - reader.read_vvm_entry(self.header.manifest.predict_duration_filename()), - reader.read_vvm_entry(self.header.manifest.predict_intonation_filename()), - ) - .await; - - Ok(EnumMap::from_array([ - predict_duration_model_result?, - predict_intonation_model_result?, - decode_model_result?, - ])) - } - /// VVMファイルから`VoiceModel`をコンストラクトする。 - pub async fn from_path(path: impl AsRef) -> Result { - let reader = AsyncVvmEntryReader::open(path.as_ref()).await?; - let manifest = reader.read_vvm_json::("manifest.json").await?; - let metas = reader - .read_vvm_json::(manifest.metas_filename()) - .await?; - let id = VoiceModelId::new(nanoid!()); - - Ok(Self { - header: VoiceModelHeader { - id, - metas, - manifest, - path: path.as_ref().into(), - }, - }) - } - - /// ID。 - pub fn id(&self) -> &VoiceModelId { - &self.header.id - } - - /// メタ情報。 - pub fn metas(&self) -> &VoiceModelMeta { - &self.header.metas - } - - pub(crate) fn header(&self) -> &VoiceModelHeader { - &self.header - } -} - -struct AsyncVvmEntry { - index: usize, - entry: async_zip::ZipEntry, -} - -#[derive(new)] -struct AsyncVvmEntryReader { - reader: async_zip::read::fs::ZipFileReader, - entry_map: 
HashMap, -} - -impl AsyncVvmEntryReader { - async fn open(path: &Path) -> LoadModelResult { - let reader = async_zip::read::fs::ZipFileReader::new(path) - .await - .map_err(|source| LoadModelError { - path: path.to_owned(), - context: LoadModelErrorKind::OpenZipFile, - source: Some(source.into()), - })?; - let entry_map: HashMap<_, _> = reader - .file() - .entries() - .iter() - .filter(|e| !e.entry().dir()) - .enumerate() - .map(|(i, e)| { - ( - e.entry().filename().to_string(), - AsyncVvmEntry { - index: i, - entry: e.entry().clone(), - }, - ) - }) - .collect(); - Ok(AsyncVvmEntryReader::new(reader, entry_map)) - } - async fn read_vvm_json(&self, filename: &str) -> LoadModelResult { - let bytes = self.read_vvm_entry(filename).await?; - serde_json::from_slice(&bytes).map_err(|source| LoadModelError { - path: self.reader.path().to_owned(), - context: LoadModelErrorKind::ReadZipEntry { - filename: filename.to_owned(), - }, - source: Some(source.into()), - }) - } - - async fn read_vvm_entry(&self, filename: &str) -> LoadModelResult> { - async { - let me = self - .entry_map - .get(filename) - .ok_or_else(|| io::Error::from(io::ErrorKind::NotFound))?; - let mut manifest_reader = self.reader.entry(me.index).await?; - let mut buf = Vec::with_capacity(me.entry.uncompressed_size() as usize); - manifest_reader - .read_to_end_checked(&mut buf, &me.entry) - .await?; - Ok::<_, anyhow::Error>(buf) - } - .await - .map_err(|source| LoadModelError { - path: self.reader.path().to_owned(), - context: LoadModelErrorKind::ReadZipEntry { - filename: filename.to_owned(), - }, - source: Some(source), - }) - } -} - // FIXME: "header"といいつつ、VVMのファイルパスを持っている状態になっている。 /// 音声モデルが持つ、各モデルファイルの実体を除く情報。 /// @@ -297,25 +58,289 @@ impl VoiceModelHeader { } pub(crate) mod blocking { - use super::VoiceModelHeader; + use std::{ + io::{self, Cursor}, + path::Path, + }; + + use enum_map::EnumMap; + use nanoid::nanoid; + use ouroboros::self_referencing; + use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; + use serde::de::DeserializeOwned; + + use crate::{ + error::{LoadModelError, LoadModelErrorKind, LoadModelResult}, + infer::domain::InferenceOperationImpl, + manifest::Manifest, + VoiceModelMeta, + }; + + use super::{VoiceModelHeader, VoiceModelId}; /// 音声モデル。 /// /// VVMファイルと対応する。 #[derive(Clone)] pub struct VoiceModel { - pub(super) header: VoiceModelHeader, + header: VoiceModelHeader, + } + + impl self::VoiceModel { + pub(crate) fn read_inference_models( + &self, + ) -> LoadModelResult>> { + let reader = BlockingVvmEntryReader::open(&self.header.path)?; + + let model_bytes = [ + self.header.manifest.predict_duration_filename(), + self.header.manifest.predict_intonation_filename(), + self.header.manifest.decode_filename(), + ] + .into_par_iter() + .map(|filename| reader.read_vvm_entry(filename)) + .collect::, _>>()? 
+ .try_into() + .unwrap_or_else(|_| panic!("should be same length")); + + Ok(EnumMap::from_array(model_bytes)) + } + + /// VVMファイルから`VoiceModel`をコンストラクトする。 + pub fn from_path(path: impl AsRef) -> crate::Result { + let path = path.as_ref().to_owned(); + let reader = BlockingVvmEntryReader::open(&path)?; + let manifest = reader.read_vvm_json::("manifest.json")?; + let metas = reader.read_vvm_json(manifest.metas_filename())?; + let id = VoiceModelId::new(nanoid!()); + + Ok(Self { + header: VoiceModelHeader { + id, + metas, + manifest, + path, + }, + }) + } + + /// ID。 + pub fn id(&self) -> &VoiceModelId { + &self.header.id + } + + /// メタ情報。 + pub fn metas(&self) -> &VoiceModelMeta { + &self.header.metas + } + + pub(crate) fn header(&self) -> &VoiceModelHeader { + &self.header + } + } + + #[self_referencing] + struct BlockingVvmEntryReader { + path: std::path::PathBuf, + zip: Vec, + #[covariant] + #[borrows(zip)] + reader: zip::ZipArchive>, + } + + impl BlockingVvmEntryReader { + fn open(path: &Path) -> LoadModelResult { + (|| { + let zip = std::fs::read(path)?; + Self::try_new(path.to_owned(), zip, |zip| { + zip::ZipArchive::new(Cursor::new(zip)) + }) + })() + .map_err(|source| LoadModelError { + path: path.to_owned(), + context: LoadModelErrorKind::OpenZipFile, + source: Some(source.into()), + }) + } + + fn read_vvm_json(&self, filename: &str) -> LoadModelResult { + let bytes = &self.read_vvm_entry(filename)?; + serde_json::from_slice(bytes).map_err(|source| LoadModelError { + path: self.borrow_path().clone(), + context: LoadModelErrorKind::OpenZipFile, + source: Some(source.into()), + }) + } + + fn read_vvm_entry(&self, filename: &str) -> LoadModelResult> { + (|| { + let mut reader = self.borrow_reader().clone(); + let mut entry = reader.by_name(filename)?; + let mut buf = Vec::with_capacity(entry.size() as _); + io::copy(&mut entry, &mut buf)?; + Ok(buf) + })() + .map_err(|source| LoadModelError { + path: self.borrow_path().clone(), + context: LoadModelErrorKind::OpenZipFile, + source: Some(source), + }) + } } } pub(crate) mod tokio { - use super::VoiceModelHeader; + use std::{collections::HashMap, io, path::Path}; + + use derive_new::new; + use enum_map::EnumMap; + use futures::future::join3; + use nanoid::nanoid; + use serde::de::DeserializeOwned; + + use crate::{ + error::{LoadModelError, LoadModelErrorKind, LoadModelResult}, + infer::domain::InferenceOperationImpl, + manifest::Manifest, + Result, VoiceModelMeta, + }; + + use super::{VoiceModelHeader, VoiceModelId}; /// 音声モデル。 /// /// VVMファイルと対応する。 #[derive(Clone)] pub struct VoiceModel { - pub(super) header: VoiceModelHeader, + header: VoiceModelHeader, + } + + impl self::VoiceModel { + pub(crate) async fn read_inference_models( + &self, + ) -> LoadModelResult>> { + let reader = AsyncVvmEntryReader::open(&self.header.path).await?; + let ( + decode_model_result, + predict_duration_model_result, + predict_intonation_model_result, + ) = join3( + reader.read_vvm_entry(self.header.manifest.decode_filename()), + reader.read_vvm_entry(self.header.manifest.predict_duration_filename()), + reader.read_vvm_entry(self.header.manifest.predict_intonation_filename()), + ) + .await; + + Ok(EnumMap::from_array([ + predict_duration_model_result?, + predict_intonation_model_result?, + decode_model_result?, + ])) + } + /// VVMファイルから`VoiceModel`をコンストラクトする。 + pub async fn from_path(path: impl AsRef) -> Result { + let reader = AsyncVvmEntryReader::open(path.as_ref()).await?; + let manifest = reader.read_vvm_json::("manifest.json").await?; + let metas = 
reader + .read_vvm_json::(manifest.metas_filename()) + .await?; + let id = VoiceModelId::new(nanoid!()); + + Ok(Self { + header: VoiceModelHeader { + id, + metas, + manifest, + path: path.as_ref().into(), + }, + }) + } + + /// ID。 + pub fn id(&self) -> &VoiceModelId { + &self.header.id + } + + /// メタ情報。 + pub fn metas(&self) -> &VoiceModelMeta { + &self.header.metas + } + + pub(crate) fn header(&self) -> &VoiceModelHeader { + &self.header + } + } + + struct AsyncVvmEntry { + index: usize, + entry: async_zip::ZipEntry, + } + + #[derive(new)] + struct AsyncVvmEntryReader { + reader: async_zip::read::fs::ZipFileReader, + entry_map: HashMap, + } + + impl AsyncVvmEntryReader { + async fn open(path: &Path) -> LoadModelResult { + let reader = async_zip::read::fs::ZipFileReader::new(path) + .await + .map_err(|source| LoadModelError { + path: path.to_owned(), + context: LoadModelErrorKind::OpenZipFile, + source: Some(source.into()), + })?; + let entry_map: HashMap<_, _> = reader + .file() + .entries() + .iter() + .filter(|e| !e.entry().dir()) + .enumerate() + .map(|(i, e)| { + ( + e.entry().filename().to_string(), + AsyncVvmEntry { + index: i, + entry: e.entry().clone(), + }, + ) + }) + .collect(); + Ok(AsyncVvmEntryReader::new(reader, entry_map)) + } + async fn read_vvm_json(&self, filename: &str) -> LoadModelResult { + let bytes = self.read_vvm_entry(filename).await?; + serde_json::from_slice(&bytes).map_err(|source| LoadModelError { + path: self.reader.path().to_owned(), + context: LoadModelErrorKind::ReadZipEntry { + filename: filename.to_owned(), + }, + source: Some(source.into()), + }) + } + + async fn read_vvm_entry(&self, filename: &str) -> LoadModelResult> { + async { + let me = self + .entry_map + .get(filename) + .ok_or_else(|| io::Error::from(io::ErrorKind::NotFound))?; + let mut manifest_reader = self.reader.entry(me.index).await?; + let mut buf = Vec::with_capacity(me.entry.uncompressed_size() as usize); + manifest_reader + .read_to_end_checked(&mut buf, &me.entry) + .await?; + Ok::<_, anyhow::Error>(buf) + } + .await + .map_err(|source| LoadModelError { + path: self.reader.path().to_owned(), + context: LoadModelErrorKind::ReadZipEntry { + filename: filename.to_owned(), + }, + source: Some(source), + }) + } } } diff --git a/crates/voicevox_core_c_api/tests/e2e/log_mask.rs b/crates/voicevox_core_c_api/tests/e2e/log_mask.rs index ce364c1be..b28442bdb 100644 --- a/crates/voicevox_core_c_api/tests/e2e/log_mask.rs +++ b/crates/voicevox_core_c_api/tests/e2e/log_mask.rs @@ -23,7 +23,7 @@ impl Utf8Output { pub(crate) fn mask_windows_video_cards(self) -> Self { self.mask_stderr( static_regex!( - r#"(?m)^\{timestamp\} INFO voicevox_core::synthesizer: 検出されたGPU \(DirectMLには1番目のGPUが使われます\):(\n\{timestamp\} INFO voicevox_core::synthesizer: - "[^"]+" \([0-9.]+ [a-zA-Z]+\))+"#, + r#"(?m)^\{timestamp\} INFO voicevox_core::synthesizer::blocking: 検出されたGPU \(DirectMLには1番目のGPUが使われます\):(\n\{timestamp\} INFO voicevox_core::synthesizer::blocking: - "[^"]+" \([0-9.]+ [a-zA-Z]+\))+"#, ), "{windows-video-cards}", )
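The new `tokio` modules introduced above (for `Synthesizer`, `UserDict`, and `VoiceModel`) all follow the same shape: the async type is a cheap-to-clone facade over an `Arc` of its `blocking` counterpart, and every async method clones that `Arc`, moves owned copies of its arguments into a closure, and hands the CPU-bound call to `crate::task::asyncify`. The following is a minimal sketch of that shape only, not the actual implementation: it uses `tokio::task::spawn_blocking` as a stand-in for `crate::task::asyncify`, and the `Engine` type, its `run` method, and the `tokio_facade` module name are hypothetical, chosen just for illustration.

    mod blocking {
        /// Hypothetical CPU-bound engine; its methods must not be called
        /// directly on the async runtime's worker threads.
        pub struct Engine;

        impl Engine {
            pub fn run(&self, input: &str) -> String {
                input.to_uppercase()
            }
        }
    }

    mod tokio_facade {
        use std::sync::Arc;

        /// Async facade: a cheap-to-clone handle over the blocking engine.
        #[derive(Clone)]
        pub struct Engine(Arc<super::blocking::Engine>);

        impl Engine {
            pub fn new() -> Self {
                Self(Arc::new(super::blocking::Engine))
            }

            pub async fn run(&self, input: &str) -> String {
                // Clone the Arc and take owned arguments so the closure is
                // `'static`, as the blocking thread pool requires.
                let blocking = self.0.clone();
                let input = input.to_owned();
                // Offload the CPU-bound work so the async runtime stays responsive.
                tokio::task::spawn_blocking(move || blocking.run(&input))
                    .await
                    .expect("blocking task panicked")
            }
        }
    }

    // Usage sketch (assumes the `tokio` crate with the "full" feature set).
    #[tokio::main]
    async fn main() {
        let engine = tokio_facade::Engine::new();
        assert_eq!(engine.run("konnichiwa").await, "KONNICHIWA");
    }

Cloning the `Arc` rather than borrowing `self`, and converting `&str`/slice arguments to owned `String`/`Vec` values before the offload, is what lets each async wrapper method in the diff satisfy the `Send + 'static` bound on the blocking task while keeping the facade itself `Clone` and lock-free.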