streaming decoder for compatible engine

VOICEVOX · Nov 20, 2024 · 0055457 · 0055457
1 parent b040aee
commit 0055457
Show file tree

Hide file tree

Showing 6 changed files with 141 additions and 12 deletions.
diff --git a/crates/test_util/build.rs b/crates/test_util/build.rs
@@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> {
                 phoneme.to_vec()
             },
         },
+        intermediate: typing::IntermediateExampleData {
+            f0_length: 69,
+            phoneme_size: 45,
+            feature_dim: 80,
+            margin_width: 14,
+            f0_vector: {
+                let mut f0 = [0.; 69];
+                f0[9..24].fill(5.905218);
+                f0[37..60].fill(5.565851);
+                f0.to_vec()
+            },
+            phoneme_vector: {
+                let mut phoneme = [0.; 45 * 69];
+                let mut set_one = |index, range| {
+                    for i in range {
+                        phoneme[(i * 45 + index) as usize] = 1.;
+                    }
+                };
+                set_one(0, 0..9);
+                set_one(37, 9..13);
+                set_one(14, 13..24);
+                set_one(35, 24..30);
+                set_one(6, 30..37);
+                set_one(37, 37..45);
+                set_one(30, 45..60);
+                set_one(0, 60..69);
+                phoneme.to_vec()
+            },
+        },
     };
 
     fs_err::write(

diff --git a/crates/test_util/compatible_engine.h b/crates/test_util/compatible_engine.h
@@ -25,4 +25,12 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list,
 bool decode_forward(int64_t length, int64_t phoneme_size, float *f0,
                     float *phoneme, int64_t *speaker_id, float *output);
 
+bool generate_full_intermediate(int64_t length, int64_t phoneme_size,
+                                int64_t margin_width, int64_t feature_dim,
+                                float *f0, float *phoneme, int64_t *speaker_id,
+                                float *output);
+
+bool render_audio_segment(int64_t length, int64_t feature_dim, float *audio_feature,
+                          int64_t *speaker_id, float *output);
+
 const char *last_error_message();
diff --git a/crates/test_util/src/typing.rs b/crates/test_util/src/typing.rs
@@ -31,11 +31,22 @@ pub struct DecodeExampleData {
     pub phoneme_vector: Vec<f32>,
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+pub struct IntermediateExampleData {
+    pub f0_length: i64,
+    pub phoneme_size: i64,
+    pub feature_dim: i64,
+    pub margin_width: i64,
+    pub f0_vector: Vec<f32>,
+    pub phoneme_vector: Vec<f32>,
+} 
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct ExampleData {
     pub speaker_id: i64,
 
     pub duration: DurationExampleData,
     pub intonation: IntonationExampleData,
     pub decode: DecodeExampleData,
+    pub intermediate: IntermediateExampleData,
 }
diff --git a/crates/voicevox_core_c_api/Cargo.toml b/crates/voicevox_core_c_api/Cargo.toml
@@ -30,6 +30,7 @@ easy-ext.workspace = true
 educe.workspace = true
 itertools.workspace = true
 libc.workspace = true
+ndarray.workspace = true
 parking_lot = { workspace = true, features = ["arc_lock"] }
 process_path.workspace = true
 ref-cast.workspace = true

diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs
@@ -364,16 +364,16 @@ pub unsafe extern "C" fn decode_forward(
 /// - `f0`はRustの`&[f32; length as usize]`として解釈できなければならない。
 /// - `phoneme`はRustの`&[f32; phoneme_size * length as usize]`として解釈できなければならない。
 /// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
-/// - `output`はRustの`&mut [f32; (length + 2 * margin_width) * feature_dim as usize]`として解釈できなければならない。
+/// - `output`はRustの`&mut [f32; ((length + 2 * margin_width) * feature_dim) as usize]`として解釈できなければならない。
 #[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
 pub unsafe extern "C" fn generate_full_intermediate(
     length: i64,
     phoneme_size: i64,
+    margin_width: i64,
+    feature_dim: i64,
     f0: *mut f32,
     phoneme: *mut f32,
     speaker_id: *mut i64,
-    margin_width: i64,
-    feature_dim: i64,
     output: *mut f32,
 ) -> bool {
     init_logger_once();
@@ -397,8 +397,10 @@ pub unsafe extern "C" fn generate_full_intermediate(
     match result {
         Ok(output_arr) => {
             // SAFETY: The safety contract must be upheld by the caller.
-            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, (length + margin_width * 2) * feature_dim) };
-            output_slice.clone_from_slice(&output_arr.to_vec());
+            let output_slice = unsafe {
+                std::slice::from_raw_parts_mut(output, (length + 2 * margin_width) * feature_dim)
+            };
+            output_slice.clone_from_slice(&output_arr.into_raw_vec());
             true
         }
         Err(err) => {
@@ -410,28 +412,30 @@ pub unsafe extern "C" fn generate_full_intermediate(
 
 /// # Safety
 ///
-/// - `audio_feature`はRustの`&[f32; (length + 2 * margin_width) * feature_dim as usize]`として解釈できなければならない。
+/// - `audio_feature`はRustの`&[f32; (length * feature_dim) as usize]`として解釈できなければならない。
 /// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
 /// - `output`はRustの`&mut [f32; length as usize * 256]`として解釈できなければならない。
 #[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
 pub unsafe extern "C" fn render_audio_segment(
     length: i64,
+    feature_dim: i64,
     audio_feature: *mut f32,
     speaker_id: *mut i64,
-    margin_width: i64,
-    feature_dim: i64,
     output: *mut f32,
 ) -> bool {
     init_logger_once();
     assert_aligned(audio_feature);
     assert_aligned(speaker_id);
     let length = length as usize;
-    let margin_width = margin_width as usize;
     let feature_dim = feature_dim as usize;
     let synthesizer = &*lock_synthesizer();
+    // SAFETY: The safety contract must be upheld by the caller.
+    let audio_feature_vec =
+        unsafe { std::slice::from_raw_parts(audio_feature, length * feature_dim) };
     let result = ensure_initialized!(synthesizer).render_audio_segment(
-        // SAFETY: The safety contract must be upheld by the caller.
-        unsafe { std::slice::from_raw_parts(audio_feature, (length + 2 * margin_width) * feature_dim) },
+        ndarray::arr1(audio_feature_vec)
+            .into_shape([length, feature_dim])
+            .unwrap(),
         StyleId::new(unsafe { *speaker_id as u32 }),
     );
     match result {

diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs
@@ -1,7 +1,7 @@
 // エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う
 
-use std::ffi::CStr;
 use std::sync::LazyLock;
+use std::{cmp::min, ffi::CStr};
 
 use assert_cmd::assert::AssertResult;
 use libloading::Library;
@@ -83,12 +83,88 @@ impl assert_cdylib::TestCase for TestCase {
             wave
         };
 
+        // 中間生成物を経由した場合の生成音声
+        let wave2 = {
+            let length_with_margin =
+                EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
+            let mut audio_feature =
+                vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
+            let mut wave = vec![0.; 256 * length_with_margin as usize];
+            assert!(lib.generate_full_intermediate(
+                EXAMPLE_DATA.intermediate.f0_length,
+                EXAMPLE_DATA.intermediate.phoneme_size,
+                EXAMPLE_DATA.intermediate.margin_width,
+                EXAMPLE_DATA.intermediate.feature_dim,
+                EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
+                EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
+                &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
+                audio_feature.as_mut_ptr(),
+            ));
+            assert!(lib.render_audio_segment(
+                length_with_margin,
+                EXAMPLE_DATA.intermediate.feature_dim,
+                audio_feature.as_ptr() as *mut f32,
+                &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
+                wave.as_mut_ptr(),
+            ));
+            wave[256 * EXAMPLE_DATA.intermediate.margin_width as usize
+                ..wave.len() - 256 * EXAMPLE_DATA.intermediate.margin_width as usize]
+                .to_vec()
+        };
+
+        // 中間生成物を経由し、さらにチャンクごとに変換した場合の生成音声
+        let wave3 = {
+            let length_with_margin =
+                EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
+            let mut audio_feature =
+                vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
+            let mut wave = vec![0.; 256 * EXAMPLE_DATA.intermediate.f0_length as usize];
+            assert!(lib.generate_full_intermediate(
+                EXAMPLE_DATA.intermediate.f0_length,
+                EXAMPLE_DATA.intermediate.phoneme_size,
+                EXAMPLE_DATA.intermediate.margin_width,
+                EXAMPLE_DATA.intermediate.feature_dim,
+                EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
+                EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
+                &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
+                audio_feature.as_mut_ptr(),
+            ));
+            let full_length = EXAMPLE_DATA.intermediate.f0_length as usize;
+            let pitch = EXAMPLE_DATA.intermediate.feature_dim as usize;
+            for render_start in (0..full_length).step_by(10) {
+                // render_start .. render_end の音声を取得する
+                let render_end = min(render_start + 10, full_length);
+                let slice_start = render_start;
+                let slice_end = render_end + 2 * EXAMPLE_DATA.intermediate.margin_width as usize;
+                let feature_segment = &audio_feature[slice_start * pitch..slice_end * pitch];
+                let slice_length = slice_end - slice_start;
+                let mut wave_segment_with_margin = vec![0.; 256 * slice_length];
+                assert!(lib.render_audio_segment(
+                    slice_length as i64,
+                    pitch as i64,
+                    feature_segment.as_ptr() as *mut f32,
+                    &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
+                    wave_segment_with_margin.as_mut_ptr(),
+                ));
+                let wave_segment = &wave_segment_with_margin[256
+                    * EXAMPLE_DATA.intermediate.margin_width as usize
+                    ..wave_segment_with_margin.len()
+                        - 256 * EXAMPLE_DATA.intermediate.margin_width as usize];
+                wave[render_start * 256..render_end * 256].clone_from_slice(wave_segment);
+            }
+            wave
+        };
+
         std::assert_eq!(SNAPSHOTS.metas, metas_json);
 
         float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01);
         float_assert::close_l1(&intonation_list, &EXAMPLE_DATA.intonation.result, 0.01);
 
         assert!(wave.iter().copied().all(f32::is_normal));
+        assert!(wave2.iter().copied().all(f32::is_normal));
+        assert!(wave3.iter().copied().all(f32::is_normal));
+        float_assert::close_l1(&wave2, &wave, 0.01);
+        float_assert::close_l1(&wave3, &wave, 0.01);
 
         lib.finalize();
         Ok(())