Skip to content

Commit

Permalink
streaming decoder for compatible engine
Browse files Browse the repository at this point in the history
  • Loading branch information
Yosshi999 committed Nov 20, 2024
1 parent b040aee commit 0055457
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 12 deletions.
29 changes: 29 additions & 0 deletions crates/test_util/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> {
phoneme.to_vec()
},
},
intermediate: typing::IntermediateExampleData {
f0_length: 69,
phoneme_size: 45,
feature_dim: 80,
margin_width: 14,
f0_vector: {
let mut f0 = [0.; 69];
f0[9..24].fill(5.905218);
f0[37..60].fill(5.565851);
f0.to_vec()
},
phoneme_vector: {
let mut phoneme = [0.; 45 * 69];
let mut set_one = |index, range| {
for i in range {
phoneme[(i * 45 + index) as usize] = 1.;
}
};
set_one(0, 0..9);
set_one(37, 9..13);
set_one(14, 13..24);
set_one(35, 24..30);
set_one(6, 30..37);
set_one(37, 37..45);
set_one(30, 45..60);
set_one(0, 60..69);
phoneme.to_vec()
},
},
};

fs_err::write(
Expand Down
8 changes: 8 additions & 0 deletions crates/test_util/compatible_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,12 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list,
bool decode_forward(int64_t length, int64_t phoneme_size, float *f0,
float *phoneme, int64_t *speaker_id, float *output);

/*
 * Generates the intermediate audio feature for a whole utterance.
 *
 * Buffer sizes, as stated by the safety notes on the Rust implementation:
 *   f0         : `length` floats
 *   phoneme    : `phoneme_size * length` floats
 *   speaker_id : pointer to a single int64_t
 *   output     : `(length + 2 * margin_width) * feature_dim` floats, written
 *                by this call
 *
 * Returns true on success.
 * NOTE(review): the failure path is not fully visible here; presumably it
 * returns false and the reason is available via last_error_message() — confirm.
 */
bool generate_full_intermediate(int64_t length, int64_t phoneme_size,
int64_t margin_width, int64_t feature_dim,
float *f0, float *phoneme, int64_t *speaker_id,
float *output);

/*
 * Renders a waveform from a segment of intermediate audio feature.
 *
 * Buffer sizes, as stated by the safety notes on the Rust implementation:
 *   audio_feature : `length * feature_dim` floats
 *   speaker_id    : pointer to a single int64_t
 *   output        : `length * 256` floats, written by this call
 *
 * `length` counts every frame handed in, including any margin frames. The
 * end-to-end test passes frames with margin on both sides and then discards
 * `256 * margin_width` samples from each end of the output.
 *
 * NOTE(review): the return convention is not visible in this view; presumably
 * true on success, as with the other functions in this header — confirm.
 */
bool render_audio_segment(int64_t length, int64_t feature_dim, float *audio_feature,
int64_t *speaker_id, float *output);

const char *last_error_message();
11 changes: 11 additions & 0 deletions crates/test_util/src/typing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,22 @@ pub struct DecodeExampleData {
pub phoneme_vector: Vec<f32>,
}

/// Example input for the intermediate-feature path
/// (`generate_full_intermediate` followed by `render_audio_segment`).
///
/// Stored as the `intermediate` field of [`ExampleData`] and (de)serialized with serde.
#[derive(Debug, Serialize, Deserialize)]
pub struct IntermediateExampleData {
/// Number of frames; in the generated example data this is the length of `f0_vector`.
pub f0_length: i64,
/// Number of phoneme classes per frame; `phoneme_vector` holds
/// `phoneme_size * f0_length` values in the generated example data.
pub phoneme_size: i64,
/// Number of feature values per frame in the intermediate audio feature.
pub feature_dim: i64,
/// Number of extra frames on each side of the utterance in the intermediate feature
/// (the feature buffer spans `f0_length + 2 * margin_width` frames).
pub margin_width: i64,
/// One f0 value per frame.
pub f0_vector: Vec<f32>,
/// Flattened per-frame phoneme values, indexed as `frame * phoneme_size + phoneme`;
/// the generated example sets a single entry to 1.0 in each frame.
pub phoneme_vector: Vec<f32>,
}

/// Top-level container for the example data consumed by the tests.
///
/// Groups one example per inference entry point together with the speaker id they share.
#[derive(Debug, Serialize, Deserialize)]
pub struct ExampleData {
/// Speaker id passed to the engine calls (as a pointer to a single `i64`).
pub speaker_id: i64,

/// Example for the duration inference.
pub duration: DurationExampleData,
/// Example for the intonation inference.
pub intonation: IntonationExampleData,
/// Example for the one-shot decode.
pub decode: DecodeExampleData,
/// Example for the intermediate-feature path
/// (`generate_full_intermediate` / `render_audio_segment`).
pub intermediate: IntermediateExampleData,
}
1 change: 1 addition & 0 deletions crates/voicevox_core_c_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ easy-ext.workspace = true
educe.workspace = true
itertools.workspace = true
libc.workspace = true
ndarray.workspace = true
parking_lot = { workspace = true, features = ["arc_lock"] }
process_path.workspace = true
ref-cast.workspace = true
Expand Down
26 changes: 15 additions & 11 deletions crates/voicevox_core_c_api/src/compatible_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,16 +364,16 @@ pub unsafe extern "C" fn decode_forward(
/// - `f0`はRustの`&[f32; length as usize]`として解釈できなければならない。
/// - `phoneme`はRustの`&[f32; phoneme_size * length as usize]`として解釈できなければならない。
/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
/// - `output`はRustの`&mut [f32; (length + 2 * margin_width) * feature_dim as usize]`として解釈できなければならない。
/// - `output`はRustの`&mut [f32; ((length + 2 * margin_width) * feature_dim) as usize]`として解釈できなければならない。
#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
pub unsafe extern "C" fn generate_full_intermediate(
length: i64,
phoneme_size: i64,
margin_width: i64,
feature_dim: i64,
f0: *mut f32,
phoneme: *mut f32,
speaker_id: *mut i64,
margin_width: i64,
feature_dim: i64,
output: *mut f32,
) -> bool {
init_logger_once();
Expand All @@ -397,8 +397,10 @@ pub unsafe extern "C" fn generate_full_intermediate(
match result {
Ok(output_arr) => {
// SAFETY: The safety contract must be upheld by the caller.
let output_slice = unsafe { std::slice::from_raw_parts_mut(output, (length + margin_width * 2) * feature_dim) };
output_slice.clone_from_slice(&output_arr.to_vec());
let output_slice = unsafe {
std::slice::from_raw_parts_mut(output, (length + 2 * margin_width) * feature_dim)
};
output_slice.clone_from_slice(&output_arr.into_raw_vec());
true
}
Err(err) => {
Expand All @@ -410,28 +412,30 @@ pub unsafe extern "C" fn generate_full_intermediate(

/// # Safety
///
/// - `audio_feature`はRustの`&[f32; (length + 2 * margin_width) * feature_dim as usize]`として解釈できなければならない。
/// - `audio_feature`はRustの`&[f32; (length * feature_dim) as usize]`として解釈できなければならない。
/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
/// - `output`はRustの`&mut [f32; length as usize * 256]`として解釈できなければならない。
#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
pub unsafe extern "C" fn render_audio_segment(
length: i64,
feature_dim: i64,
audio_feature: *mut f32,
speaker_id: *mut i64,
margin_width: i64,
feature_dim: i64,
output: *mut f32,
) -> bool {
init_logger_once();
assert_aligned(audio_feature);
assert_aligned(speaker_id);
let length = length as usize;
let margin_width = margin_width as usize;
let feature_dim = feature_dim as usize;
let synthesizer = &*lock_synthesizer();
// SAFETY: The safety contract must be upheld by the caller.
let audio_feature_vec =
unsafe { std::slice::from_raw_parts(audio_feature, length * feature_dim) };
let result = ensure_initialized!(synthesizer).render_audio_segment(
// SAFETY: The safety contract must be upheld by the caller.
unsafe { std::slice::from_raw_parts(audio_feature, (length + 2 * margin_width) * feature_dim) },
ndarray::arr1(audio_feature_vec)
.into_shape([length, feature_dim])
.unwrap(),
StyleId::new(unsafe { *speaker_id as u32 }),
);
match result {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う

use std::ffi::CStr;
use std::sync::LazyLock;
use std::{cmp::min, ffi::CStr};

use assert_cmd::assert::AssertResult;
use libloading::Library;
Expand Down Expand Up @@ -83,12 +83,88 @@ impl assert_cdylib::TestCase for TestCase {
wave
};

// 中間生成物を経由した場合の生成音声
let wave2 = {
let length_with_margin =
EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
let mut audio_feature =
vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
let mut wave = vec![0.; 256 * length_with_margin as usize];
assert!(lib.generate_full_intermediate(
EXAMPLE_DATA.intermediate.f0_length,
EXAMPLE_DATA.intermediate.phoneme_size,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
audio_feature.as_mut_ptr(),
));
assert!(lib.render_audio_segment(
length_with_margin,
EXAMPLE_DATA.intermediate.feature_dim,
audio_feature.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
wave.as_mut_ptr(),
));
wave[256 * EXAMPLE_DATA.intermediate.margin_width as usize
..wave.len() - 256 * EXAMPLE_DATA.intermediate.margin_width as usize]
.to_vec()
};

// 中間生成物を経由し、さらにチャンクごとに変換した場合の生成音声
let wave3 = {
let length_with_margin =
EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
let mut audio_feature =
vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
let mut wave = vec![0.; 256 * EXAMPLE_DATA.intermediate.f0_length as usize];
assert!(lib.generate_full_intermediate(
EXAMPLE_DATA.intermediate.f0_length,
EXAMPLE_DATA.intermediate.phoneme_size,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
audio_feature.as_mut_ptr(),
));
let full_length = EXAMPLE_DATA.intermediate.f0_length as usize;
let pitch = EXAMPLE_DATA.intermediate.feature_dim as usize;
for render_start in (0..full_length).step_by(10) {
// render_start .. render_end の音声を取得する
let render_end = min(render_start + 10, full_length);
let slice_start = render_start;
let slice_end = render_end + 2 * EXAMPLE_DATA.intermediate.margin_width as usize;
let feature_segment = &audio_feature[slice_start * pitch..slice_end * pitch];
let slice_length = slice_end - slice_start;
let mut wave_segment_with_margin = vec![0.; 256 * slice_length];
assert!(lib.render_audio_segment(
slice_length as i64,
pitch as i64,
feature_segment.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
wave_segment_with_margin.as_mut_ptr(),
));
let wave_segment = &wave_segment_with_margin[256
* EXAMPLE_DATA.intermediate.margin_width as usize
..wave_segment_with_margin.len()
- 256 * EXAMPLE_DATA.intermediate.margin_width as usize];
wave[render_start * 256..render_end * 256].clone_from_slice(wave_segment);
}
wave
};

std::assert_eq!(SNAPSHOTS.metas, metas_json);

float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01);
float_assert::close_l1(&intonation_list, &EXAMPLE_DATA.intonation.result, 0.01);

assert!(wave.iter().copied().all(f32::is_normal));
assert!(wave2.iter().copied().all(f32::is_normal));
assert!(wave3.iter().copied().all(f32::is_normal));
float_assert::close_l1(&wave2, &wave, 0.01);
float_assert::close_l1(&wave3, &wave, 0.01);

lib.finalize();
Ok(())
Expand Down

0 comments on commit 0055457

Please sign in to comment.