From 203410c26e867d6f4f2efa85a5d6086d3612d588 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Fri, 20 Dec 2024 22:10:19 +0900 Subject: [PATCH] =?UTF-8?q?refactor:=20`mod=20inner`=E3=82=92=E5=89=8A?= =?UTF-8?q?=E9=99=A4=20(#897)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #865 で入れた`mod inner`を消す。 Refs: https://github.com/VOICEVOX/voicevox_core/pull/865/files#r1823086190 --- crates/voicevox_core/src/synthesizer.rs | 2114 ++++++++--------- .../voicevox_core_c_api/tests/e2e/log_mask.rs | 2 +- .../tests/e2e/snapshots.toml | 20 +- 3 files changed, 1053 insertions(+), 1083 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 7437a3ca5..e9240de86 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,9 +1,36 @@ -use crate::{ - asyncs::{BlockingThreadPool, SingleTasked}, - infer, +use easy_ext::ext; +use enum_map::enum_map; +use std::{ + io::{Cursor, Write as _}, + marker::PhantomData, + ops::Range, + sync::Arc, }; +use tracing::info; -pub use self::inner::MARGIN; +use crate::{ + asyncs::{Async, BlockingThreadPool, SingleTasked}, + devices::{DeviceSpec, GpuSpec}, + engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme}, + error::ErrorRepr, + infer::{ + self, + domains::{ + FrameDecodeDomain, FrameDecodeOperation, GenerateFullIntermediateInput, + GenerateFullIntermediateOutput, InferenceDomainMap, PredictDurationInput, + PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, + PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input, + PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, + RenderAudioSegmentInput, RenderAudioSegmentOutput, SfDecodeInput, SfDecodeOutput, + SingingTeacherDomain, SingingTeacherOperation, TalkDomain, TalkOperation, + }, + InferenceRuntime, InferenceSessionOptions, + }, + status::Status, + text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, + voice_model, AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, VoiceModelId, + VoiceModelMeta, +}; /// [`blocking::Synthesizer::synthesis`]および[`nonblocking::Synthesizer::synthesis`]のオプション。 /// @@ -100,1211 +127,1154 @@ impl AsyncExt for BlockingThreadPool { } } -mod inner { - use easy_ext::ext; - use enum_map::enum_map; - use std::{ - io::{Cursor, Write as _}, - marker::PhantomData, - ops::Range, - sync::Arc, - }; - use tracing::info; +const DEFAULT_SAMPLING_RATE: u32 = 24000; +/// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(フレーム数) +const PADDING_FRAME_LENGTH: usize = 38; // (0.4秒 * 24000Hz / 256.0).round() +/// 音声生成の際、音声特徴量の前後に確保すべきマージン幅(フレーム数) +/// モデルの受容野から計算される +pub const MARGIN: usize = 14; +/// 指定した音声区間に対応する特徴量を両端にマージンを追加した上で切り出す +fn crop_with_margin(audio: &AudioFeature, range: Range) -> ndarray::ArrayView2<'_, f32> { + if range.start > audio.frame_length || range.end > audio.frame_length { + panic!( + "{range:?} is out of range for audio feature of length {frame_length}", + frame_length = audio.frame_length, + ); + } + if range.start > range.end { + panic!("{range:?} is invalid because start > end",); + } + let range = range.start..range.end + 2 * MARGIN; + audio.internal_state.slice(ndarray::s![range, ..]) +} +/// 追加した安全マージンを生成音声から取り除く +fn trim_margin_from_wave(wave_with_margin: ndarray::Array1) -> ndarray::Array1 { + let len = wave_with_margin.len(); + wave_with_margin.slice_move(ndarray::s![MARGIN * 256..len - MARGIN * 256]) +} - use crate::{ - asyncs::{Async, 
BlockingThreadPool, SingleTasked}, - devices::{DeviceSpec, GpuSpec}, - engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme}, - error::ErrorRepr, - infer::{ - self, - domains::{ - FrameDecodeDomain, FrameDecodeOperation, GenerateFullIntermediateInput, - GenerateFullIntermediateOutput, InferenceDomainMap, PredictDurationInput, - PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, - PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, - PredictSingF0Input, PredictSingF0Output, PredictSingVolumeInput, - PredictSingVolumeOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput, - SfDecodeInput, SfDecodeOutput, SingingTeacherDomain, SingingTeacherOperation, - TalkDomain, TalkOperation, - }, - InferenceRuntime, InferenceSessionOptions, - }, - status::Status, - text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, - voice_model, AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, - SynthesisOptions, VoiceModelId, VoiceModelMeta, - }; +/// 音声の中間表現。 +pub struct AudioFeature { + /// (フレーム数, 特徴数)の形を持つ音声特徴量。 + internal_state: ndarray::Array2, + /// 生成時に指定したスタイル番号。 + style_id: crate::StyleId, + /// workaround paddingを除いた音声特徴量のフレーム数。 + pub frame_length: usize, + /// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。 + pub frame_rate: f64, + /// 生成時に利用したクエリ。 + audio_query: AudioQuery, +} - use super::{AccelerationMode, AsyncExt, InitializeOptions, TtsOptions}; - - const DEFAULT_SAMPLING_RATE: u32 = 24000; - /// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(フレーム数) - const PADDING_FRAME_LENGTH: usize = 38; // (0.4秒 * 24000Hz / 256.0).round() - /// 音声生成の際、音声特徴量の前後に確保すべきマージン幅(フレーム数) - /// モデルの受容野から計算される - pub const MARGIN: usize = 14; - /// 指定した音声区間に対応する特徴量を両端にマージンを追加した上で切り出す - fn crop_with_margin(audio: &AudioFeature, range: Range) -> ndarray::ArrayView2<'_, f32> { - if range.start > audio.frame_length || range.end > audio.frame_length { - panic!( - "{range:?} is out of range for audio feature of length {frame_length}", - frame_length = audio.frame_length, - ); - } - if range.start > range.end { - panic!("{range:?} is invalid because start > end",); - } - let range = range.start..range.end + 2 * MARGIN; - audio.internal_state.slice(ndarray::s![range, ..]) - } - /// 追加した安全マージンを生成音声から取り除く - fn trim_margin_from_wave(wave_with_margin: ndarray::Array1) -> ndarray::Array1 { - let len = wave_with_margin.len(); - wave_with_margin.slice_move(ndarray::s![MARGIN * 256..len - MARGIN * 256]) - } - - /// 音声の中間表現。 - pub struct AudioFeature { - /// (フレーム数, 特徴数)の形を持つ音声特徴量。 - internal_state: ndarray::Array2, - /// 生成時に指定したスタイル番号。 - style_id: crate::StyleId, - /// workaround paddingを除いた音声特徴量のフレーム数。 - pub frame_length: usize, - /// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。 - pub frame_rate: f64, - /// 生成時に利用したクエリ。 - audio_query: AudioQuery, - } - - pub struct Inner { - pub(super) status: Arc>, - open_jtalk_analyzer: OpenJTalkAnalyzer, - kana_analyzer: KanaAnalyzer, - use_gpu: bool, - _marker: PhantomData A>, - } - - impl From> for Inner { - fn from(from: Inner) -> Self { - Self { - status: from.status, - open_jtalk_analyzer: from.open_jtalk_analyzer, - kana_analyzer: KanaAnalyzer, - use_gpu: from.use_gpu, - _marker: PhantomData, - } +struct Inner { + status: Arc>, + open_jtalk_analyzer: OpenJTalkAnalyzer, + kana_analyzer: KanaAnalyzer, + use_gpu: bool, + _marker: PhantomData A>, +} + +impl From> for Inner { + fn from(from: Inner) -> Self { + Self { + status: from.status, + open_jtalk_analyzer: from.open_jtalk_analyzer, + kana_analyzer: KanaAnalyzer, + use_gpu: 
from.use_gpu, + _marker: PhantomData, } } +} - impl Inner { - pub(super) fn new( - onnxruntime: &'static crate::blocking::Onnxruntime, - open_jtalk: O, - options: &InitializeOptions, - ) -> Result { - #[cfg(windows)] - list_windows_video_cards(); - - let test_gpus = || { - info!("GPUをテストします:"); - let availabilities = crate::devices::test_gpus( - GpuSpec::defaults(), - crate::blocking::Onnxruntime::DISPLAY_NAME, - onnxruntime.supported_devices()?, - |gpu| onnxruntime.test_gpu(gpu), - ); - for line in availabilities.to_string().lines() { - info!(" {line}"); - } - crate::Result::Ok(availabilities) - }; +impl Inner { + fn new( + onnxruntime: &'static crate::blocking::Onnxruntime, + open_jtalk: O, + options: &InitializeOptions, + ) -> Result { + #[cfg(windows)] + list_windows_video_cards(); + + let test_gpus = || { + info!("GPUをテストします:"); + let availabilities = crate::devices::test_gpus( + GpuSpec::defaults(), + crate::blocking::Onnxruntime::DISPLAY_NAME, + onnxruntime.supported_devices()?, + |gpu| onnxruntime.test_gpu(gpu), + ); + for line in availabilities.to_string().lines() { + info!(" {line}"); + } + crate::Result::Ok(availabilities) + }; - let device_for_heavy = match options.acceleration_mode { - AccelerationMode::Auto => match *test_gpus()?.oks() { - [] => DeviceSpec::Cpu, + let device_for_heavy = match options.acceleration_mode { + AccelerationMode::Auto => match *test_gpus()?.oks() { + [] => DeviceSpec::Cpu, + [gpu, ..] => DeviceSpec::Gpu(gpu), + }, + AccelerationMode::Cpu => DeviceSpec::Cpu, + AccelerationMode::Gpu => { + let availabilities = test_gpus()?; + match *availabilities.oks() { + [] => return Err(ErrorRepr::GpuSupport(availabilities).into()), [gpu, ..] => DeviceSpec::Gpu(gpu), - }, - AccelerationMode::Cpu => DeviceSpec::Cpu, - AccelerationMode::Gpu => { - let availabilities = test_gpus()?; - match *availabilities.oks() { - [] => return Err(ErrorRepr::GpuSupport(availabilities).into()), - [gpu, ..] => DeviceSpec::Gpu(gpu), - } } - }; - - info!("{device_for_heavy}を利用します"); - - // 軽いモデルはこちらを使う - let light_session_options = - InferenceSessionOptions::new(options.cpu_num_threads, DeviceSpec::Cpu); - - // 重いモデルはこちらを使う - let heavy_session_options = - InferenceSessionOptions::new(options.cpu_num_threads, device_for_heavy); - - let status = Status::new( - onnxruntime, - InferenceDomainMap { - talk: enum_map! { - TalkOperation::PredictDuration - | TalkOperation::PredictIntonation - | TalkOperation::GenerateFullIntermediate => light_session_options, - TalkOperation::RenderAudioSegment => heavy_session_options, - }, - singing_teacher: enum_map! { - SingingTeacherOperation::PredictSingConsonantLength - | SingingTeacherOperation::PredictSingF0 - | SingingTeacherOperation::PredictSingVolume => light_session_options, - }, - frame_decode: enum_map! { - FrameDecodeOperation::SfDecode => heavy_session_options, - }, + } + }; + + info!("{device_for_heavy}を利用します"); + + // 軽いモデルはこちらを使う + let light_session_options = + InferenceSessionOptions::new(options.cpu_num_threads, DeviceSpec::Cpu); + + // 重いモデルはこちらを使う + let heavy_session_options = + InferenceSessionOptions::new(options.cpu_num_threads, device_for_heavy); + + let status = Status::new( + onnxruntime, + InferenceDomainMap { + talk: enum_map! { + TalkOperation::PredictDuration + | TalkOperation::PredictIntonation + | TalkOperation::GenerateFullIntermediate => light_session_options, + TalkOperation::RenderAudioSegment => heavy_session_options, }, - ) - .into(); + singing_teacher: enum_map! 
{ + SingingTeacherOperation::PredictSingConsonantLength + | SingingTeacherOperation::PredictSingF0 + | SingingTeacherOperation::PredictSingVolume => light_session_options, + }, + frame_decode: enum_map! { + FrameDecodeOperation::SfDecode => heavy_session_options, + }, + }, + ) + .into(); - let use_gpu = matches!(device_for_heavy, DeviceSpec::Gpu(_)); + let use_gpu = matches!(device_for_heavy, DeviceSpec::Gpu(_)); - Ok(Self { - status, - open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), - kana_analyzer: KanaAnalyzer, - use_gpu, - _marker: PhantomData, - }) - } + Ok(Self { + status, + open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), + kana_analyzer: KanaAnalyzer, + use_gpu, + _marker: PhantomData, + }) + } - pub(super) fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { - self.status.rt - } + fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { + self.status.rt + } - pub(super) fn is_gpu_mode(&self) -> bool { - self.use_gpu - } + fn is_gpu_mode(&self) -> bool { + self.use_gpu + } - pub(super) async fn load_voice_model( - &self, - model: &voice_model::Inner, - ) -> crate::Result<()> { - let model_bytes = model.read_inference_models().await?; + async fn load_voice_model(&self, model: &voice_model::Inner) -> crate::Result<()> { + let model_bytes = model.read_inference_models().await?; - let status = self.status.clone(); - let header = model.header().clone(); - A::unblock(move || status.insert_model(&header, &model_bytes)).await - } + let status = self.status.clone(); + let header = model.header().clone(); + A::unblock(move || status.insert_model(&header, &model_bytes)).await + } - pub(super) fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { - self.status.unload_model(voice_model_id) - } + fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { + self.status.unload_model(voice_model_id) + } - pub(super) fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { - self.status.is_loaded_model(voice_model_id) - } + fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { + self.status.is_loaded_model(voice_model_id) + } - pub(super) fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.status.is_loaded_model_by_style_id(style_id) - } + fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { + self.status.is_loaded_model_by_style_id(style_id) + } - pub(super) fn metas(&self) -> VoiceModelMeta { - self.status.metas() - } + fn metas(&self) -> VoiceModelMeta { + self.status.metas() + } - pub(super) async fn precompute_render( - &self, - audio_query: &AudioQuery, - style_id: StyleId, - options: &SynthesisOptions, - ) -> Result { - let AudioQuery { - accent_phrases, - speed_scale, - pitch_scale, - intonation_scale, - pre_phoneme_length, - post_phoneme_length, - .. - } = audio_query; + async fn precompute_render( + &self, + audio_query: &AudioQuery, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result { + let AudioQuery { + accent_phrases, + speed_scale, + pitch_scale, + intonation_scale, + pre_phoneme_length, + post_phoneme_length, + .. 
+ } = audio_query; + + let accent_phrases = if options.enable_interrogative_upspeak { + &adjust_interrogative_accent_phrases(accent_phrases) + } else { + accent_phrases + }; - let accent_phrases = if options.enable_interrogative_upspeak { - &adjust_interrogative_accent_phrases(accent_phrases) - } else { - accent_phrases - }; + let (flatten_moras, phoneme_data_list) = initial_process(accent_phrases); - let (flatten_moras, phoneme_data_list) = initial_process(accent_phrases); + let mut phoneme_length_list = vec![*pre_phoneme_length]; + let mut f0_list = vec![0.]; + let mut voiced_list = vec![false]; + { + let mut sum_of_f0_bigger_than_zero = 0.; + let mut count_of_f0_bigger_than_zero = 0; - let mut phoneme_length_list = vec![*pre_phoneme_length]; - let mut f0_list = vec![0.]; - let mut voiced_list = vec![false]; + for Mora { + consonant_length, + vowel_length, + pitch, + .. + } in flatten_moras { - let mut sum_of_f0_bigger_than_zero = 0.; - let mut count_of_f0_bigger_than_zero = 0; - - for Mora { - consonant_length, - vowel_length, - pitch, - .. - } in flatten_moras - { - if let Some(consonant_length) = consonant_length { - phoneme_length_list.push(consonant_length); - } - phoneme_length_list.push(vowel_length); + if let Some(consonant_length) = consonant_length { + phoneme_length_list.push(consonant_length); + } + phoneme_length_list.push(vowel_length); - let f0_single = pitch * 2.0_f32.powf(*pitch_scale); - f0_list.push(f0_single); + let f0_single = pitch * 2.0_f32.powf(*pitch_scale); + f0_list.push(f0_single); - let bigger_than_zero = f0_single > 0.; - voiced_list.push(bigger_than_zero); + let bigger_than_zero = f0_single > 0.; + voiced_list.push(bigger_than_zero); - if bigger_than_zero { - sum_of_f0_bigger_than_zero += f0_single; - count_of_f0_bigger_than_zero += 1; - } + if bigger_than_zero { + sum_of_f0_bigger_than_zero += f0_single; + count_of_f0_bigger_than_zero += 1; } - phoneme_length_list.push(*post_phoneme_length); - f0_list.push(0.); - voiced_list.push(false); - let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); - - if !mean_f0.is_nan() { - for i in 0..f0_list.len() { - if voiced_list[i] { - f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; - } + } + phoneme_length_list.push(*post_phoneme_length); + f0_list.push(0.); + voiced_list.push(false); + let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); + + if !mean_f0.is_nan() { + for i in 0..f0_list.len() { + if voiced_list[i] { + f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; } } } + } - let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); + let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); + + let mut phoneme = Vec::new(); + let mut f0: Vec = Vec::new(); + { + const RATE: f32 = 24000. 
/ 256.; + let mut sum_of_phoneme_length = 0; + let mut count_of_f0 = 0; + let mut vowel_indexes_index = 0; + + for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { + // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする + // + // https://github.com/VOICEVOX/voicevox_engine/issues/552 + let phoneme_length = ((*phoneme_length * RATE).round_ties_even() / speed_scale) + .round_ties_even() as usize; + let phoneme_id = phoneme_data_list[i].phoneme_id(); + + for _ in 0..phoneme_length { + let mut phonemes_vec = [0.; OjtPhoneme::num_phoneme()]; + phonemes_vec[phoneme_id as usize] = 1.; + phoneme.push(phonemes_vec) + } + sum_of_phoneme_length += phoneme_length; - let mut phoneme = Vec::new(); - let mut f0: Vec = Vec::new(); - { - const RATE: f32 = 24000. / 256.; - let mut sum_of_phoneme_length = 0; - let mut count_of_f0 = 0; - let mut vowel_indexes_index = 0; - - for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { - // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする - // - // https://github.com/VOICEVOX/voicevox_engine/issues/552 - let phoneme_length = ((*phoneme_length * RATE).round_ties_even() / speed_scale) - .round_ties_even() as usize; - let phoneme_id = phoneme_data_list[i].phoneme_id(); - - for _ in 0..phoneme_length { - let mut phonemes_vec = [0.; OjtPhoneme::num_phoneme()]; - phonemes_vec[phoneme_id as usize] = 1.; - phoneme.push(phonemes_vec) - } - sum_of_phoneme_length += phoneme_length; - - if i as i64 == vowel_indexes[vowel_indexes_index] { - for _ in 0..sum_of_phoneme_length { - f0.push(f0_list[count_of_f0]); - } - count_of_f0 += 1; - sum_of_phoneme_length = 0; - vowel_indexes_index += 1; + if i as i64 == vowel_indexes[vowel_indexes_index] { + for _ in 0..sum_of_phoneme_length { + f0.push(f0_list[count_of_f0]); } + count_of_f0 += 1; + sum_of_phoneme_length = 0; + vowel_indexes_index += 1; } } + } - let spec = self - .generate_full_intermediate( - f0.len(), - OjtPhoneme::num_phoneme(), - &f0, - phoneme.as_flattened(), - style_id, - ) - .await?; - return Ok(AudioFeature { - internal_state: spec, + let spec = self + .generate_full_intermediate( + f0.len(), + OjtPhoneme::num_phoneme(), + &f0, + phoneme.as_flattened(), style_id, - frame_length: f0.len(), - frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0, - audio_query: audio_query.clone(), - }); - - fn adjust_interrogative_accent_phrases( - accent_phrases: &[AccentPhrase], - ) -> Vec { - accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: adjust_interrogative_moras(accent_phrase), - ..accent_phrase.clone() - }) - .collect() - } + ) + .await?; + return Ok(AudioFeature { + internal_state: spec, + style_id, + frame_length: f0.len(), + frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0, + audio_query: audio_query.clone(), + }); + + fn adjust_interrogative_accent_phrases( + accent_phrases: &[AccentPhrase], + ) -> Vec { + accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: adjust_interrogative_moras(accent_phrase), + ..accent_phrase.clone() + }) + .collect() + } - fn adjust_interrogative_moras( - AccentPhrase { - moras, - is_interrogative, - .. 
- }: &AccentPhrase, - ) -> Vec { - if *is_interrogative && !moras.is_empty() { - let last_mora = moras.last().unwrap(); - if last_mora.pitch != 0.0 { - let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); - new_moras.extend_from_slice(moras.as_slice()); - let interrogative_mora = make_interrogative_mora(last_mora); - new_moras.push(interrogative_mora); - return new_moras; - } + fn adjust_interrogative_moras( + AccentPhrase { + moras, + is_interrogative, + .. + }: &AccentPhrase, + ) -> Vec { + if *is_interrogative && !moras.is_empty() { + let last_mora = moras.last().unwrap(); + if last_mora.pitch != 0.0 { + let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); + new_moras.extend_from_slice(moras.as_slice()); + let interrogative_mora = make_interrogative_mora(last_mora); + new_moras.push(interrogative_mora); + return new_moras; } - moras.clone() } + moras.clone() + } - fn make_interrogative_mora(last_mora: &Mora) -> Mora { - const FIX_VOWEL_LENGTH: f32 = 0.15; - const ADJUST_PITCH: f32 = 0.3; - const MAX_PITCH: f32 = 6.5; + fn make_interrogative_mora(last_mora: &Mora) -> Mora { + const FIX_VOWEL_LENGTH: f32 = 0.15; + const ADJUST_PITCH: f32 = 0.3; + const MAX_PITCH: f32 = 6.5; - let pitch = (last_mora.pitch + ADJUST_PITCH).min(MAX_PITCH); + let pitch = (last_mora.pitch + ADJUST_PITCH).min(MAX_PITCH); - Mora { - text: mora_to_text(None, &last_mora.vowel), - consonant: None, - consonant_length: None, - vowel: last_mora.vowel.clone(), - vowel_length: FIX_VOWEL_LENGTH, - pitch, - } + Mora { + text: mora_to_text(None, &last_mora.vowel), + consonant: None, + consonant_length: None, + vowel: last_mora.vowel.clone(), + vowel_length: FIX_VOWEL_LENGTH, + pitch, } } + } - pub(super) async fn render( - &self, - audio: &AudioFeature, - range: Range, - ) -> Result> { - // TODO: 44.1kHzなどの対応 - if range.is_empty() { - // FIXME: `start>end`に対してパニックせずに正常に空を返してしまうのでは? - // 指定区間が空のときは早期リターン - return Ok(vec![]); - } - let spec_segment = crop_with_margin(audio, range); - let wave_with_margin = self - .render_audio_segment(spec_segment.to_owned(), audio.style_id) - .await?; - let wave = trim_margin_from_wave(wave_with_margin); - return Ok(to_s16le_pcm( - wave.as_slice() - .expect("`trim_margin_from_wave` should just trim an array"), - &audio.audio_query, - )); - - fn to_s16le_pcm( - wave: &[f32], - &AudioQuery { - volume_scale, - output_sampling_rate, - output_stereo, - .. - }: &AudioQuery, - ) -> Vec { - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let bytes_size = wave.len() as u32 * repeat_count * 2; - let buf: Vec = Vec::with_capacity(bytes_size as usize); - let mut cur = Cursor::new(buf); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } + async fn render(&self, audio: &AudioFeature, range: Range) -> Result> { + // TODO: 44.1kHzなどの対応 + if range.is_empty() { + // FIXME: `start>end`に対してパニックせずに正常に空を返してしまうのでは? 
+ // 指定区間が空のときは早期リターン + return Ok(vec![]); + } + let spec_segment = crop_with_margin(audio, range); + let wave_with_margin = self + .render_audio_segment(spec_segment.to_owned(), audio.style_id) + .await?; + let wave = trim_margin_from_wave(wave_with_margin); + return Ok(to_s16le_pcm( + wave.as_slice() + .expect("`trim_margin_from_wave` should just trim an array"), + &audio.audio_query, + )); + + fn to_s16le_pcm( + wave: &[f32], + &AudioQuery { + volume_scale, + output_sampling_rate, + output_stereo, + .. + }: &AudioQuery, + ) -> Vec { + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let repeat_count: u32 = + (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let bytes_size = wave.len() as u32 * repeat_count * 2; + let buf: Vec = Vec::with_capacity(bytes_size as usize); + let mut cur = Cursor::new(buf); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); } - - cur.into_inner() } - } - pub(super) async fn synthesis( - &self, - audio_query: &AudioQuery, - style_id: StyleId, - options: &SynthesisOptions, - ) -> Result> { - let audio = self - .precompute_render(audio_query, style_id, options) - .await?; - let pcm = self.render(&audio, 0..audio.frame_length).await?; - Ok(wav_from_s16le( - &pcm, - audio_query.output_sampling_rate, - audio_query.output_stereo, - )) + cur.into_inner() } + } - pub(super) async fn create_accent_phrases_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result> { - let accent_phrases = self.kana_analyzer.analyze(kana)?; - self.replace_mora_data(&accent_phrases, style_id).await - } + async fn synthesis( + &self, + audio_query: &AudioQuery, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { + let audio = self + .precompute_render(audio_query, style_id, options) + .await?; + let pcm = self.render(&audio, 0..audio.frame_length).await?; + Ok(wav_from_s16le( + &pcm, + audio_query.output_sampling_rate, + audio_query.output_stereo, + )) + } - pub(super) async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhrase], - style_id: StyleId, - ) -> Result> { - let accent_phrases = self - .replace_phoneme_length(accent_phrases, style_id) - .await?; - self.replace_mora_pitch(&accent_phrases, style_id).await - } + async fn create_accent_phrases_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.kana_analyzer.analyze(kana)?; + self.replace_mora_data(&accent_phrases, style_id).await + } - pub(super) async fn replace_phoneme_length( - &self, - accent_phrases: &[AccentPhrase], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = initial_process(accent_phrases); + async fn replace_mora_data( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> Result> { + let accent_phrases = self + .replace_phoneme_length(accent_phrases, style_id) + .await?; + self.replace_mora_pitch(&accent_phrases, style_id).await + } - let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); + async fn replace_phoneme_length( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); - let phoneme_list_s: Vec = phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let phoneme_length = self.predict_duration(&phoneme_list_s, style_id).await?; + let (_, _, vowel_indexes_data) = 
split_mora(&phoneme_data_list); - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: accent_phrase - .moras - .iter() - .map(|mora| { - let new_mora = Mora { - consonant_length: mora.consonant.as_ref().map(|_| { - phoneme_length[vowel_indexes_data[index + 1] as usize - 1] - }), - vowel_length: phoneme_length - [vowel_indexes_data[index + 1] as usize], - ..mora.clone() - }; - index += 1; - new_mora - }) - .collect(), - pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { - let new_pause_mora = Mora { + let phoneme_list_s: Vec = phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let phoneme_length = self.predict_duration(&phoneme_list_s, style_id).await?; + + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: accent_phrase + .moras + .iter() + .map(|mora| { + let new_mora = Mora { + consonant_length: mora.consonant.as_ref().map(|_| { + phoneme_length[vowel_indexes_data[index + 1] as usize - 1] + }), vowel_length: phoneme_length[vowel_indexes_data[index + 1] as usize], - ..pause_mora.clone() + ..mora.clone() }; index += 1; - new_pause_mora - }), - ..accent_phrase.clone() - }) - .collect(); + new_mora + }) + .collect(), + pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { + let new_pause_mora = Mora { + vowel_length: phoneme_length[vowel_indexes_data[index + 1] as usize], + ..pause_mora.clone() + }; + index += 1; + new_pause_mora + }), + ..accent_phrase.clone() + }) + .collect(); - Ok(new_accent_phrases) - } + Ok(new_accent_phrases) + } - pub(super) async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhrase], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = initial_process(accent_phrases); - - let mut base_start_accent_list = vec![0]; - let mut base_end_accent_list = vec![0]; - let mut base_start_accent_phrase_list = vec![0]; - let mut base_end_accent_phrase_list = vec![0]; - for accent_phrase in accent_phrases { - let mut accent = usize::from(accent_phrase.accent != 1); - create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); - - accent = accent_phrase.accent - 1; - create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); - create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); - create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); - } - base_start_accent_list.push(0); - base_end_accent_list.push(0); - base_start_accent_phrase_list.push(0); - base_end_accent_phrase_list.push(0); + async fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let mut base_start_accent_list = vec![0]; + let mut base_end_accent_list = vec![0]; + let mut base_start_accent_phrase_list = vec![0]; + let mut base_end_accent_phrase_list = vec![0]; + for accent_phrase in accent_phrases { + let mut accent = usize::from(accent_phrase.accent != 1); + create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); + + accent = accent_phrase.accent - 1; + create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); + create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); + create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); + } + base_start_accent_list.push(0); + base_end_accent_list.push(0); + 
base_start_accent_phrase_list.push(0); + base_end_accent_phrase_list.push(0); + + let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = + split_mora(&phoneme_data_list); + + let consonant_phoneme_list: Vec = consonant_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let vowel_phoneme_list: Vec = vowel_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); - let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = - split_mora(&phoneme_data_list); + let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let consonant_phoneme_list: Vec = consonant_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let vowel_phoneme_list: Vec = vowel_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - - let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - - for vowel_index in vowel_indexes { - start_accent_list.push(base_start_accent_list[vowel_index as usize]); - end_accent_list.push(base_end_accent_list[vowel_index as usize]); - start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); - end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); - } + for vowel_index in vowel_indexes { + start_accent_list.push(base_start_accent_list[vowel_index as usize]); + end_accent_list.push(base_end_accent_list[vowel_index as usize]); + start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); + end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); + } - let mut f0_list = self - .predict_intonation( - vowel_phoneme_list.len(), - &vowel_phoneme_list, - &consonant_phoneme_list, - &start_accent_list, - &end_accent_list, - &start_accent_phrase_list, - &end_accent_phrase_list, - style_id, - ) - .await?; + let mut f0_list = self + .predict_intonation( + vowel_phoneme_list.len(), + &vowel_phoneme_list, + &consonant_phoneme_list, + &start_accent_list, + &end_accent_list, + &start_accent_phrase_list, + &end_accent_phrase_list, + style_id, + ) + .await?; - for i in 0..vowel_phoneme_data_list.len() { - const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; + for i in 0..vowel_phoneme_data_list.len() { + const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; - if UNVOICED_MORA_PHONEME_LIST - .iter() - .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) - { - f0_list[i] = 0.; - } + if UNVOICED_MORA_PHONEME_LIST + .iter() + .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) + { + f0_list[i] = 0.; } + } - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| AccentPhrase { - moras: accent_phrase - .moras - .iter() - .map(|mora| { - let new_mora = Mora { - pitch: f0_list[index + 1], - ..mora.clone() - }; - index += 1; - new_mora - }) - .collect(), - pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { - let new_pause_mora = Mora { 
+ let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| AccentPhrase { + moras: accent_phrase + .moras + .iter() + .map(|mora| { + let new_mora = Mora { pitch: f0_list[index + 1], - ..pause_mora.clone() + ..mora.clone() }; index += 1; - new_pause_mora - }), - ..accent_phrase.clone() - }) - .collect(); - - return Ok(new_accent_phrases); + new_mora + }) + .collect(), + pause_mora: accent_phrase.pause_mora.as_ref().map(|pause_mora| { + let new_pause_mora = Mora { + pitch: f0_list[index + 1], + ..pause_mora.clone() + }; + index += 1; + new_pause_mora + }), + ..accent_phrase.clone() + }) + .collect(); - fn create_one_accent_list( - accent_list: &mut Vec, - accent_phrase: &AccentPhrase, - point: i32, - ) { - let mut one_accent_list: Vec = Vec::new(); + return Ok(new_accent_phrases); - for (i, mora) in accent_phrase.moras.iter().enumerate() { - let value = (i as i32 == point - || (point < 0 && i == (accent_phrase.moras.len() as i32 + point) as usize)) - .into(); + fn create_one_accent_list( + accent_list: &mut Vec, + accent_phrase: &AccentPhrase, + point: i32, + ) { + let mut one_accent_list: Vec = Vec::new(); + + for (i, mora) in accent_phrase.moras.iter().enumerate() { + let value = (i as i32 == point + || (point < 0 && i == (accent_phrase.moras.len() as i32 + point) as usize)) + .into(); + one_accent_list.push(value); + if mora.consonant.is_some() { one_accent_list.push(value); - if mora.consonant.is_some() { - one_accent_list.push(value); - } - } - if accent_phrase.pause_mora.is_some() { - one_accent_list.push(0); } - accent_list.extend(one_accent_list) } + if accent_phrase.pause_mora.is_some() { + one_accent_list.push(0); + } + accent_list.extend(one_accent_list) } + } - pub(super) async fn create_audio_query_from_kana( - &self, - kana: &str, - style_id: StyleId, - ) -> Result { - let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; - Ok(AudioQuery::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) - } - - pub(super) async fn tts_from_kana( - &self, - kana: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let audio_query = &self.create_audio_query_from_kana(kana, style_id).await?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) - .await - } + async fn create_audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; + Ok(AudioQuery::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) } - impl Inner { - pub(super) async fn create_accent_phrases( - &self, - text: &str, - style_id: StyleId, - ) -> Result> { - let accent_phrases = self.open_jtalk_analyzer.analyze(text)?; - self.replace_mora_data(&accent_phrases, style_id).await - } + async fn tts_from_kana( + &self, + kana: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> Result> { + let audio_query = &self.create_audio_query_from_kana(kana, style_id).await?; + self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + .await + } +} - pub(super) async fn create_audio_query( - &self, - text: &str, - style_id: StyleId, - ) -> Result { - let accent_phrases = self.create_accent_phrases(text, style_id).await?; - Ok(AudioQuery::from_accent_phrases(accent_phrases)) - } +impl Inner { + async fn create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.open_jtalk_analyzer.analyze(text)?; + 
self.replace_mora_data(&accent_phrases, style_id).await + } - pub(super) async fn tts( - &self, - text: &str, - style_id: StyleId, - options: &TtsOptions, - ) -> Result> { - let audio_query = &self.create_audio_query(text, style_id).await?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) - .await - } + async fn create_audio_query(&self, text: &str, style_id: StyleId) -> Result { + let accent_phrases = self.create_accent_phrases(text, style_id).await?; + Ok(AudioQuery::from_accent_phrases(accent_phrases)) } - // TODO: この層を破壊する - impl Inner { - pub(super) async fn predict_duration( - &self, - phoneme_vector: &[i64], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let phoneme_vector = ndarray::arr1(phoneme_vector); - status.predict_duration::(phoneme_vector, style_id).await - } - - #[expect( - clippy::too_many_arguments, - reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ - まとめたりしても可読性に寄与しない" - )] - pub(super) async fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let vowel_phoneme_vector = ndarray::arr1(vowel_phoneme_vector); - let consonant_phoneme_vector = ndarray::arr1(consonant_phoneme_vector); - let start_accent_vector = ndarray::arr1(start_accent_vector); - let end_accent_vector = ndarray::arr1(end_accent_vector); - let start_accent_phrase_vector = ndarray::arr1(start_accent_phrase_vector); - let end_accent_phrase_vector = ndarray::arr1(end_accent_phrase_vector); - status - .predict_intonation::( - length, - vowel_phoneme_vector, - consonant_phoneme_vector, - start_accent_vector, - end_accent_vector, - start_accent_phrase_vector, - end_accent_phrase_vector, - style_id, - ) - .await - } + async fn tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { + let audio_query = &self.create_audio_query(text, style_id).await?; + self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + .await + } +} - pub(super) async fn generate_full_intermediate( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let f0 = ndarray::arr1(f0); - let phoneme_vector = ndarray::arr1(phoneme_vector); - status - .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) - .await - } +// TODO: この層を破壊する +impl Inner { + async fn predict_duration( + &self, + phoneme_vector: &[i64], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let phoneme_vector = ndarray::arr1(phoneme_vector); + status.predict_duration::(phoneme_vector, style_id).await + } - pub(super) async fn render_audio_segment( - &self, - spec: ndarray::Array2, - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - status.render_audio_segment::(spec, style_id).await - } + #[expect( + clippy::too_many_arguments, + reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ + まとめたりしても可読性に寄与しない" + )] + async fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: &[i64], + consonant_phoneme_vector: &[i64], + start_accent_vector: &[i64], + end_accent_vector: &[i64], + start_accent_phrase_vector: &[i64], + end_accent_phrase_vector: &[i64], + style_id: StyleId, + ) -> Result> { + 
let status = self.status.clone(); + let vowel_phoneme_vector = ndarray::arr1(vowel_phoneme_vector); + let consonant_phoneme_vector = ndarray::arr1(consonant_phoneme_vector); + let start_accent_vector = ndarray::arr1(start_accent_vector); + let end_accent_vector = ndarray::arr1(end_accent_vector); + let start_accent_phrase_vector = ndarray::arr1(start_accent_phrase_vector); + let end_accent_phrase_vector = ndarray::arr1(end_accent_phrase_vector); + status + .predict_intonation::( + length, + vowel_phoneme_vector, + consonant_phoneme_vector, + start_accent_vector, + end_accent_vector, + start_accent_phrase_vector, + end_accent_phrase_vector, + style_id, + ) + .await + } - pub(super) async fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result> { - let status = self.status.clone(); - let f0 = ndarray::arr1(f0); - let phoneme_vector = ndarray::arr1(phoneme_vector); - status - .decode::(length, phoneme_size, f0, phoneme_vector, style_id) - .await - } + async fn generate_full_intermediate( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let f0 = ndarray::arr1(f0); + let phoneme_vector = ndarray::arr1(phoneme_vector); + status + .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) + .await } - impl Status { - pub(super) async fn predict_duration( - &self, - phoneme_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictDurationOutput { - phoneme_length: output, - } = self - .run_session::( - model_id, - PredictDurationInput { - phoneme_list: phoneme_vector, - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - ) - .await?; - let mut output = output.into_raw_vec(); + async fn render_audio_segment( + &self, + spec: ndarray::Array2, + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + status.render_audio_segment::(spec, style_id).await + } - for output_item in output.iter_mut() { - if *output_item < PHONEME_LENGTH_MINIMAL { - *output_item = PHONEME_LENGTH_MINIMAL; - } - } + async fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let f0 = ndarray::arr1(f0); + let phoneme_vector = ndarray::arr1(phoneme_vector); + status + .decode::(length, phoneme_size, f0, phoneme_vector, style_id) + .await + } +} - return Ok(output); +impl Status { + async fn predict_duration( + &self, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictDurationOutput { + phoneme_length: output, + } = self + .run_session::( + model_id, + PredictDurationInput { + phoneme_list: phoneme_vector, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + ) + .await?; + let mut output = output.into_raw_vec(); - const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + for output_item in output.iter_mut() { + if *output_item < PHONEME_LENGTH_MINIMAL { + *output_item = PHONEME_LENGTH_MINIMAL; + } } - #[expect( - clippy::too_many_arguments, - reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ - まとめたりしても可読性に寄与しない" - )] - pub(super) async fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: ndarray::Array1, - consonant_phoneme_vector: 
ndarray::Array1, - start_accent_vector: ndarray::Array1, - end_accent_vector: ndarray::Array1, - start_accent_phrase_vector: ndarray::Array1, - end_accent_phrase_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictIntonationOutput { f0_list: output } = self - .run_session::( - model_id, - PredictIntonationInput { - length: ndarray::arr0(length as i64), - vowel_phoneme_list: vowel_phoneme_vector, - consonant_phoneme_list: consonant_phoneme_vector, - start_accent_list: start_accent_vector, - end_accent_list: end_accent_vector, - start_accent_phrase_list: start_accent_phrase_vector, - end_accent_phrase_list: end_accent_phrase_vector, - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - ) - .await?; + return Ok(output); - Ok(output.into_raw_vec()) - } + const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + } - /// モデル`generate_full_intermediate`の実行と、その前後の処理を行う。 - /// - /// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。 - pub(super) async fn generate_full_intermediate( - &self, - length: usize, - phoneme_size: usize, - f0: ndarray::Array1, - phoneme_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - // 音が途切れてしまうのを避けるworkaround処理が入っている - // TODO: 改善したらここのpadding処理を取り除く - let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; - let length_with_padding = f0.len() + start_and_end_padding_size; - let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); - let phoneme_with_padding = make_phoneme_with_padding( - phoneme_vector.into_shape([length, phoneme_size]).unwrap(), - PADDING_FRAME_LENGTH, - ); + #[expect( + clippy::too_many_arguments, + reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ + まとめたりしても可読性に寄与しない" + )] + async fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: ndarray::Array1, + consonant_phoneme_vector: ndarray::Array1, + start_accent_vector: ndarray::Array1, + end_accent_vector: ndarray::Array1, + start_accent_phrase_vector: ndarray::Array1, + end_accent_phrase_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictIntonationOutput { f0_list: output } = self + .run_session::( + model_id, + PredictIntonationInput { + length: ndarray::arr0(length as i64), + vowel_phoneme_list: vowel_phoneme_vector, + consonant_phoneme_list: consonant_phoneme_vector, + start_accent_list: start_accent_vector, + end_accent_list: end_accent_vector, + start_accent_phrase_list: start_accent_phrase_vector, + end_accent_phrase_list: end_accent_phrase_vector, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + ) + .await?; - let GenerateFullIntermediateOutput { - spec: spec_with_padding, - } = self - .run_session::( - model_id, - GenerateFullIntermediateInput { - f0: f0_with_padding - .into_shape([length_with_padding, 1]) - .unwrap(), - phoneme: phoneme_with_padding, - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - ) - .await?; + Ok(output.into_raw_vec()) + } - // マージンがデータからはみ出さないことを保証 - // cf. 
https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 - if MARGIN > PADDING_FRAME_LENGTH { - unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); - } - // マージン分を両端に残して音声特徴量を返す - return Ok(spec_with_padding - .slice(ndarray::s![ - PADDING_FRAME_LENGTH - MARGIN - ..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN, - .. - ]) - .to_owned()); - - fn make_f0_with_padding( - f0_slice: ndarray::Array1, - padding_size: usize, - ) -> ndarray::Array1 { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let padding = ndarray::Array1::::zeros(padding_size); - ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] - } + /// モデル`generate_full_intermediate`の実行と、その前後の処理を行う。 + /// + /// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。 + async fn generate_full_intermediate( + &self, + length: usize, + phoneme_size: usize, + f0: ndarray::Array1, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + // 音が途切れてしまうのを避けるworkaround処理が入っている + // TODO: 改善したらここのpadding処理を取り除く + let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; + let length_with_padding = f0.len() + start_and_end_padding_size; + let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); + let phoneme_with_padding = make_phoneme_with_padding( + phoneme_vector.into_shape([length, phoneme_size]).unwrap(), + PADDING_FRAME_LENGTH, + ); - fn make_phoneme_with_padding( - phoneme_slice: ndarray::Array2, - padding_size: usize, - ) -> ndarray::Array2 { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut padding = - ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); - padding - .slice_mut(ndarray::s![.., 0]) - .assign(&ndarray::arr0(1.0)); - ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] - } + let GenerateFullIntermediateOutput { + spec: spec_with_padding, + } = self + .run_session::( + model_id, + GenerateFullIntermediateInput { + f0: f0_with_padding + .into_shape([length_with_padding, 1]) + .unwrap(), + phoneme: phoneme_with_padding, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + ) + .await?; + + // マージンがデータからはみ出さないことを保証 + // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 + if MARGIN > PADDING_FRAME_LENGTH { + unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); + } + // マージン分を両端に残して音声特徴量を返す + return Ok(spec_with_padding + .slice(ndarray::s![ + PADDING_FRAME_LENGTH - MARGIN + ..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN, + .. 
+ ]) + .to_owned()); + + fn make_f0_with_padding( + f0_slice: ndarray::Array1, + padding_size: usize, + ) -> ndarray::Array1 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let padding = ndarray::Array1::::zeros(padding_size); + ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] + } + + fn make_phoneme_with_padding( + phoneme_slice: ndarray::Array2, + padding_size: usize, + ) -> ndarray::Array2 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let mut padding = ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); + padding + .slice_mut(ndarray::s![.., 0]) + .assign(&ndarray::arr0(1.0)); + ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] } + } - /// 与えられた音声特徴量で音声生成。 - pub(super) async fn render_audio_segment( - &self, - spec: ndarray::Array2, - style_id: StyleId, - ) -> Result> { - let (model_id, _inner_voice_id) = self.ids_for::(style_id)?; - let RenderAudioSegmentOutput { wave } = self - .run_session::(model_id, RenderAudioSegmentInput { spec }) - .await?; - Ok(wave) - } + /// 与えられた音声特徴量で音声生成。 + async fn render_audio_segment( + &self, + spec: ndarray::Array2, + style_id: StyleId, + ) -> Result> { + let (model_id, _inner_voice_id) = self.ids_for::(style_id)?; + let RenderAudioSegmentOutput { wave } = self + .run_session::(model_id, RenderAudioSegmentInput { spec }) + .await?; + Ok(wave) + } - pub(super) async fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: ndarray::Array1, - phoneme_vector: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let intermediate = self - .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) - .await?; - let output_with_margin = self - .render_audio_segment::(intermediate, style_id) - .await?; - let output = trim_margin_from_wave(output_with_margin); - Ok(output.to_vec()) - } - - pub(super) async fn predict_sing_consonant_length( - &self, - consonant: ndarray::Array1, - vowel: ndarray::Array1, - note_duration: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictSingConsonantLengthOutput { consonant_lengths } = self - .run_session::( - model_id, - PredictSingConsonantLengthInput { - consonants: consonant.into_one_row(), - vowels: vowel.into_one_row(), - note_durations: note_duration.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + async fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: ndarray::Array1, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let intermediate = self + .generate_full_intermediate::(length, phoneme_size, f0, phoneme_vector, style_id) + .await?; + let output_with_margin = self + .render_audio_segment::(intermediate, style_id) + .await?; + let output = trim_margin_from_wave(output_with_margin); + Ok(output.to_vec()) + } - Ok(consonant_lengths) - } + async fn predict_sing_consonant_length( + &self, + consonant: ndarray::Array1, + vowel: ndarray::Array1, + note_duration: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingConsonantLengthOutput { consonant_lengths } = self + .run_session::( + model_id, + PredictSingConsonantLengthInput { + consonants: consonant.into_one_row(), + vowels: vowel.into_one_row(), + note_durations: note_duration.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; - pub(super) async fn predict_sing_f0( 
- &self, - phoneme: ndarray::Array1, - note: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictSingF0Output { f0s } = self - .run_session::( - model_id, - PredictSingF0Input { - phonemes: phoneme.into_one_row(), - notes: note.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + Ok(consonant_lengths) + } - Ok(f0s) - } + async fn predict_sing_f0( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingF0Output { f0s } = self + .run_session::( + model_id, + PredictSingF0Input { + phonemes: phoneme.into_one_row(), + notes: note.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; - pub(super) async fn predict_sing_volume( - &self, - phoneme: ndarray::Array1, - note: ndarray::Array1, - f0: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let PredictSingVolumeOutput { volumes } = self - .run_session::( - model_id, - PredictSingVolumeInput { - phonemes: phoneme.into_one_row(), - notes: note.into_one_row(), - frame_f0s: f0.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + Ok(f0s) + } - Ok(volumes) - } + async fn predict_sing_volume( + &self, + phoneme: ndarray::Array1, + note: ndarray::Array1, + f0: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictSingVolumeOutput { volumes } = self + .run_session::( + model_id, + PredictSingVolumeInput { + phonemes: phoneme.into_one_row(), + notes: note.into_one_row(), + frame_f0s: f0.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; - pub(super) async fn sf_decode( - &self, - phoneme: ndarray::Array1, - f0: ndarray::Array1, - volume: ndarray::Array1, - style_id: StyleId, - ) -> Result> { - let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - - let SfDecodeOutput { wav } = self - .run_session::( - model_id, - SfDecodeInput { - frame_phonemes: phoneme.into_one_row(), - frame_f0s: f0.into_one_row(), - frame_volumes: volume.into_one_row(), - speaker_id: ndarray::array![inner_voice_id.raw_id().into()], - }, - ) - .await?; + Ok(volumes) + } - Ok(wav) - } + async fn sf_decode( + &self, + phoneme: ndarray::Array1, + f0: ndarray::Array1, + volume: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let SfDecodeOutput { wav } = self + .run_session::( + model_id, + SfDecodeInput { + frame_phonemes: phoneme.into_one_row(), + frame_f0s: f0.into_one_row(), + frame_volumes: volume.into_one_row(), + speaker_id: ndarray::array![inner_voice_id.raw_id().into()], + }, + ) + .await?; + + Ok(wav) } +} - #[ext] - impl ndarray::Array1 { - fn into_one_row(self) -> ndarray::Array2 { - let n = self.len(); - self.into_shape([1, n]).expect("should be ok") - } +#[ext] +impl ndarray::Array1 { + fn into_one_row(self) -> ndarray::Array2 { + let n = self.len(); + self.into_shape([1, n]).expect("should be ok") } +} - #[cfg(windows)] - fn list_windows_video_cards() { - use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; +#[cfg(windows)] +fn list_windows_video_cards() { + use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; - use humansize::BINARY; - use 
tracing::{error, info}; - use windows::Win32::Graphics::Dxgi::{ - CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND, - }; + use humansize::BINARY; + use tracing::{error, info}; + use windows::Win32::Graphics::Dxgi::{ + CreateDXGIFactory, IDXGIFactory, DXGI_ADAPTER_DESC, DXGI_ERROR_NOT_FOUND, + }; - info!("検出されたGPU (DirectMLにはGPU 0が使われます):"); - match list_windows_video_cards() { - Ok(descs) => { - for (device_id, desc) in descs.into_iter().enumerate() { - let description = OsString::from_wide(trim_nul(&desc.Description)); - let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY); - info!(" GPU {device_id}: {description:?} ({vram})"); - } + info!("検出されたGPU (DirectMLにはGPU 0が使われます):"); + match list_windows_video_cards() { + Ok(descs) => { + for (device_id, desc) in descs.into_iter().enumerate() { + let description = OsString::from_wide(trim_nul(&desc.Description)); + let vram = humansize::format_size(desc.DedicatedVideoMemory, BINARY); + info!(" GPU {device_id}: {description:?} ({vram})"); } - Err(err) => error!("{err}"), } + Err(err) => error!("{err}"), + } - fn list_windows_video_cards() -> windows::core::Result> { - unsafe { - let factory = CreateDXGIFactory::()?; - (0..) - .map(|i| factory.EnumAdapters(i)?.GetDesc()) - .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND)) - .collect() - } + fn list_windows_video_cards() -> windows::core::Result> { + unsafe { + let factory = CreateDXGIFactory::()?; + (0..) + .map(|i| factory.EnumAdapters(i)?.GetDesc()) + .take_while(|r| !matches!(r, Err(e) if e.code() == DXGI_ERROR_NOT_FOUND)) + .collect() } + } - fn trim_nul(s: &[u16]) -> &[u16] { - &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())] - } + fn trim_nul(s: &[u16]) -> &[u16] { + &s[..s.iter().position(|&c| c == 0x0000).unwrap_or(s.len())] } +} - fn initial_process(accent_phrases: &[AccentPhrase]) -> (Vec, Vec) { - let flatten_moras = to_flatten_moras(accent_phrases); +fn initial_process(accent_phrases: &[AccentPhrase]) -> (Vec, Vec) { + let flatten_moras = to_flatten_moras(accent_phrases); - let mut phoneme_strings = vec!["pau".to_string()]; - for mora in flatten_moras.iter() { - if let Some(consonant) = &mora.consonant { - phoneme_strings.push(consonant.clone()) - } - phoneme_strings.push(mora.vowel.clone()); + let mut phoneme_strings = vec!["pau".to_string()]; + for mora in flatten_moras.iter() { + if let Some(consonant) = &mora.consonant { + phoneme_strings.push(consonant.clone()) } - phoneme_strings.push("pau".to_string()); + phoneme_strings.push(mora.vowel.clone()); + } + phoneme_strings.push("pau".to_string()); - let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); + let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); - return (flatten_moras, phoneme_data_list); + return (flatten_moras, phoneme_data_list); - fn to_flatten_moras(accent_phrases: &[AccentPhrase]) -> Vec { - let mut flatten_moras = Vec::new(); + fn to_flatten_moras(accent_phrases: &[AccentPhrase]) -> Vec { + let mut flatten_moras = Vec::new(); - for AccentPhrase { - moras, pause_mora, .. - } in accent_phrases - { - for mora in moras { - flatten_moras.push(mora.clone()); - } - if let Some(pause_mora) = pause_mora { - flatten_moras.push(pause_mora.clone()); - } + for AccentPhrase { + moras, pause_mora, .. 
+ } in accent_phrases + { + for mora in moras { + flatten_moras.push(mora.clone()); + } + if let Some(pause_mora) = pause_mora { + flatten_moras.push(pause_mora.clone()); } - - flatten_moras } - fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { - OjtPhoneme::convert( - phoneme_str_list - .iter() - .map(AsRef::as_ref) - .map(ToOwned::to_owned) - .map(OjtPhoneme::new) - .collect::>() - .as_slice(), - ) - } + flatten_moras } - fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { - let mut vowel_indexes = Vec::new(); - for (i, phoneme) in phoneme_list.iter().enumerate() { - const MORA_PHONEME_LIST: &[&str] = &[ - "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", - ]; - - if MORA_PHONEME_LIST + fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { + OjtPhoneme::convert( + phoneme_str_list .iter() - .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) - { - vowel_indexes.push(i as i64); - } - } + .map(AsRef::as_ref) + .map(ToOwned::to_owned) + .map(OjtPhoneme::new) + .collect::>() + .as_slice(), + ) + } +} - let vowel_phoneme_list = vowel_indexes +fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { + let mut vowel_indexes = Vec::new(); + for (i, phoneme) in phoneme_list.iter().enumerate() { + const MORA_PHONEME_LIST: &[&str] = &[ + "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", + ]; + + if MORA_PHONEME_LIST .iter() - .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) - .collect(); + .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) + { + vowel_indexes.push(i as i64); + } + } - let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; - for i in 0..(vowel_indexes.len() - 1) { - let prev = vowel_indexes[i]; - let next = vowel_indexes[i + 1]; - if next - prev == 1 { - consonant_phoneme_list.push(OjtPhoneme::default()); - } else { - consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); - } + let vowel_phoneme_list = vowel_indexes + .iter() + .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) + .collect(); + + let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; + for i in 0..(vowel_indexes.len() - 1) { + let prev = vowel_indexes[i]; + let next = vowel_indexes[i + 1]; + if next - prev == 1 { + consonant_phoneme_list.push(OjtPhoneme::default()); + } else { + consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); } + } - (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) - } - - impl AudioQuery { - fn from_accent_phrases(accent_phrases: Vec) -> Self { - let kana = create_kana(&accent_phrases); - Self { - accent_phrases, - speed_scale: 1., - pitch_scale: 0., - intonation_scale: 1., - volume_scale: 1., - pre_phoneme_length: 0.1, - post_phoneme_length: 0.1, - output_sampling_rate: DEFAULT_SAMPLING_RATE, - output_stereo: false, - pause_length: (), - pause_length_scale: (), - kana: Some(kana), - } + (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) +} + +impl AudioQuery { + fn from_accent_phrases(accent_phrases: Vec) -> Self { + let kana = create_kana(&accent_phrases); + Self { + accent_phrases, + speed_scale: 1., + pitch_scale: 0., + intonation_scale: 1., + volume_scale: 1., + pre_phoneme_length: 0.1, + post_phoneme_length: 0.1, + output_sampling_rate: DEFAULT_SAMPLING_RATE, + output_stereo: false, + pause_length: (), + pause_length_scale: (), + kana: Some(kana), } } } @@ -1324,9 +1294,9 @@ pub(crate) mod blocking { FullcontextExtractor, StyleId, VoiceModelId, VoiceModelMeta, }; - use super::{inner::Inner, InitializeOptions, 
SynthesisOptions, TtsOptions}; + use super::{InitializeOptions, Inner, SynthesisOptions, TtsOptions}; - pub use super::inner::AudioFeature; + pub use super::AudioFeature; /// 音声シンセサイザ。 pub struct Synthesizer(pub(super) Inner); @@ -1772,7 +1742,7 @@ pub(crate) mod nonblocking { StyleId, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; - use super::{inner::Inner, InitializeOptions, TtsOptions}; + use super::{InitializeOptions, Inner, TtsOptions}; /// 音声シンセサイザ。 /// diff --git a/crates/voicevox_core_c_api/tests/e2e/log_mask.rs b/crates/voicevox_core_c_api/tests/e2e/log_mask.rs index f4771ac3e..a9ffe0f6e 100644 --- a/crates/voicevox_core_c_api/tests/e2e/log_mask.rs +++ b/crates/voicevox_core_c_api/tests/e2e/log_mask.rs @@ -31,7 +31,7 @@ impl Utf8Output { pub(crate) fn mask_windows_video_cards(self) -> Self { self.mask_stderr( static_regex!( - r#"(?m)^\{timestamp\} INFO voicevox_core::synthesizer::inner: 検出されたGPU \(DirectMLにはGPU 0が使われます\):(\n\{timestamp\} INFO voicevox_core::synthesizer::inner: GPU [0-9]+: "[^"]+" \([0-9.]+ [a-zA-Z]+\))+"#, + r#"(?m)^\{timestamp\} INFO voicevox_core::synthesizer: 検出されたGPU \(DirectMLにはGPU 0が使われます\):(\n\{timestamp\} INFO voicevox_core::synthesizer: GPU [0-9]+: "[^"]+" \([0-9.]+ [a-zA-Z]+\))+"#, ), "{windows-video-cards}", ) diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index c45292fad..87a704af4 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -72,10 +72,10 @@ metas = ''' ]''' stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [compatible_engine_load_model_before_initialize] @@ -138,10 +138,10 @@ stderr = '' output."こんにちは、音声合成の世界へようこそ".wav_length = 176172 stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [synthesizer_new_output_json] @@ -218,29 +218,29 @@ metas = ''' ]''' stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [tts_via_audio_query] output."こんにちは、音声合成の世界へようこそ".wav_length = 176172 stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [user_dict_load] stderr.windows = ''' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' stderr.unix = ''' -{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer: CPUを利用します ''' [user_dict_manipulate]
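For readers skimming the moved code: below is a minimal, standalone sketch (not part of the patch) of the consonant/vowel pairing that `split_mora` performs, using plain `&str` phonemes in place of `OjtPhoneme` so it compiles on its own. The function and variable names here, and the example input in `main`, are illustrative only.

/// Phonemes that can end a mora (vowels, devoiced vowels, "N", "cl", "pau"),
/// copied from the list used by the diff's `split_mora`.
const MORA_PHONEME_LIST: &[&str] = &[
    "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau",
];

/// Splits a flat phoneme sequence into per-mora consonant/vowel lists plus the
/// indexes of the mora-final phonemes, mirroring the structure of `split_mora`.
fn split_mora<'a>(phonemes: &[&'a str]) -> (Vec<&'a str>, Vec<&'a str>, Vec<i64>) {
    // A mora ends at every vowel-like phoneme.
    let mut vowel_indexes = Vec::new();
    for (i, phoneme) in phonemes.iter().enumerate() {
        if MORA_PHONEME_LIST.iter().any(|m| m == phoneme) {
            vowel_indexes.push(i as i64);
        }
    }

    let vowels: Vec<&str> = vowel_indexes
        .iter()
        .map(|&i| phonemes[i as usize])
        .collect();

    // A consonant exists only when two consecutive vowel indexes are not adjacent;
    // the first mora gets a placeholder ("" here, `OjtPhoneme::default()` in the diff).
    let mut consonants: Vec<&str> = vec![""];
    for w in vowel_indexes.windows(2) {
        let (prev, next) = (w[0], w[1]);
        consonants.push(if next - prev == 1 {
            "" // adjacent vowels: this mora has no consonant
        } else {
            phonemes[next as usize - 1]
        });
    }

    (consonants, vowels, vowel_indexes)
}

fn main() {
    // Flattened phonemes for "こんにちは" surrounded by pauses (illustrative only).
    let phonemes = ["pau", "k", "o", "N", "n", "i", "ch", "i", "w", "a", "pau"];
    let (consonants, vowels, vowel_indexes) = split_mora(&phonemes);
    for ((c, v), i) in consonants.iter().zip(&vowels).zip(&vowel_indexes) {
        println!("mora ending at index {i}: consonant={c:?}, vowel={v:?}");
    }
}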
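Likewise, a small self-contained sketch of the two ndarray manipulations the diff relies on: zero-padding an f0 contour on both ends (the workaround in `make_f0_with_padding`) and reshaping a 1-D vector into a single-row matrix (`into_one_row`). It assumes only the `ndarray` crate; `pad_f0` and the numbers in `main` are made up for the example.

use ndarray::{Array1, Array2};

/// Pads an f0 contour with `padding_size` silent (zero) frames on both ends.
fn pad_f0(f0: Array1<f32>, padding_size: usize) -> Array1<f32> {
    let padding = Array1::<f32>::zeros(padding_size);
    ndarray::concatenate![ndarray::Axis(0), padding, f0, padding]
}

/// Reshapes a length-n vector into a (1, n) matrix, as `into_one_row` does for
/// the single-batch inference inputs.
fn into_one_row(v: Array1<f32>) -> Array2<f32> {
    let n = v.len();
    v.into_shape([1, n])
        .expect("a length-n vector always fits a 1×n matrix")
}

fn main() {
    let f0 = Array1::from(vec![5.5_f32, 5.6, 5.4]);
    let padded = pad_f0(f0, 2);
    assert_eq!(padded.len(), 7); // 2 + 3 + 2 frames

    let row = into_one_row(padded);
    assert_eq!(row.shape(), &[1, 7]);
}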