ストリーミングモードのdecodeを実装（precompute_renderとrender） (#854)

この本文は @qryxip が記述している。

#851 で生まれた`generate_full_intermediate`と`render_audio_segment`を用いて次の公開APIを作る。`precompute_render`で`AudioFeature`を生成し、`AudioFeature`と区間指定を引数とした`render`で指定区間のPCMを生成する形。

- `voicevox_core::blocking::Synthesizer::precompute_render`
- `voicevox_core::blocking::Synthesizer::render`
- `voicevox_core::blocking::AudioFeature`

また`render`で生成したPCMをWAVとして組み立てるため、次の公開APIも作る。

- `voicevox_core::wav_from_s16le`

ただしこのPRで実装するのはRust APIとPython APIのみ。非同期API、C API、Java APIについては今後実装する。Python APIのtype stubも今後用意する。またテストも今後書く。

Refs: #853

Co-authored-by: Ryo Yamashita <[email protected]>
Co-authored-by: Hiroshiba <[email protected]>
Co-authored-by: Nanashi. <[email protected]>
VOICEVOX · Oct 29, 2024 · 5641e37 · 5641e37
1 parent fdae73a
commit 5641e37
Show file tree

Hide file tree

Showing 9 changed files with 352 additions and 119 deletions.
diff --git a/crates/voicevox_core/src/blocking.rs b/crates/voicevox_core/src/blocking.rs
@@ -2,8 +2,8 @@
 
 pub use crate::{
     engine::open_jtalk::blocking::OpenJtalk, infer::runtimes::onnxruntime::blocking::Onnxruntime,
-    synthesizer::blocking::Synthesizer, user_dict::dict::blocking::UserDict,
-    voice_model::blocking::VoiceModelFile,
+    synthesizer::blocking::AudioFeature, synthesizer::blocking::Synthesizer,
+    user_dict::dict::blocking::UserDict, voice_model::blocking::VoiceModelFile,
 };
 
 pub mod onnxruntime {

diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs
@@ -0,0 +1,32 @@
+use std::io::{Cursor, Write as _};
+
/// Prepends a 44-byte RIFF/WAVE header to 16-bit PCM data and returns the resulting WAV binary.
///
/// `pcm` is copied verbatim after the header. As the function name indicates, it is expected to
/// hold signed 16-bit little-endian samples; neither the sample format nor the alignment of
/// `pcm.len()` to the block size is validated here.
///
/// The header declares two channels when `is_stereo` is `true` and one channel otherwise, a bit
/// depth of 16, and the linear PCM format tag (`1`).
///
/// # Panics
///
/// Panics if `pcm` is longer than `u32::MAX - 36` bytes (the RIFF size fields are 32 bits wide),
/// or if the byte rate `sampling_rate * block_size` does not fit in a `u32`.
pub fn wav_from_s16le(pcm: &[u8], sampling_rate: u32, is_stereo: bool) -> Vec<u8> {
    // "RIFF" + size (8) + "WAVE" (4) + "fmt " chunk (8 + 16) + "data" chunk header (8).
    const HEADER_LEN: u32 = 44;
    // The RIFF size field does not count the leading "RIFF" tag nor the size field itself.
    const RIFF_PREFIX_LEN: u32 = 8;
    const FMT_CHUNK_LEN: u32 = 16;
    const FORMAT_LINEAR_PCM: u16 = 1;
    const BIT_DEPTH: u16 = 16;

    let num_channels: u16 = if is_stereo { 2 } else { 1 };
    let block_size: u16 = BIT_DEPTH * num_channels / 8;

    // Reject payloads whose size cannot be represented in the header instead of silently
    // truncating the length and emitting a corrupt file.
    let max_data_len = u32::MAX - (HEADER_LEN - RIFF_PREFIX_LEN);
    assert!(
        pcm.len() as u64 <= u64::from(max_data_len),
        "PCM data is too large for the 32-bit size fields of a WAV header"
    );
    let data_len = pcm.len() as u32; // lossless: bounded by the assertion above
    let riff_len = data_len + (HEADER_LEN - RIFF_PREFIX_LEN);

    // Checked so that debug and release builds behave the same on overflow.
    let byte_rate = sampling_rate
        .checked_mul(u32::from(block_size))
        .expect("sampling rate is too large: the WAV byte rate does not fit in 32 bits");

    let mut cur = Cursor::new(Vec::<u8>::with_capacity(HEADER_LEN as usize + pcm.len()));
    let mut put = |bytes: &[u8]| {
        cur.write_all(bytes)
            .expect("writing to an in-memory buffer should not fail")
    };

    // RIFF descriptor
    put(b"RIFF");
    put(&riff_len.to_le_bytes());
    // "fmt " chunk
    put(b"WAVEfmt ");
    put(&FMT_CHUNK_LEN.to_le_bytes());
    put(&FORMAT_LINEAR_PCM.to_le_bytes());
    put(&num_channels.to_le_bytes());
    put(&sampling_rate.to_le_bytes());
    put(&byte_rate.to_le_bytes());
    put(&block_size.to_le_bytes());
    put(&BIT_DEPTH.to_le_bytes());
    // "data" chunk
    put(b"data");
    put(&data_len.to_le_bytes());
    put(pcm);

    cur.into_inner()
}
diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs
@@ -1,11 +1,13 @@
 mod acoustic_feature_extractor;
+mod audio_file;
 mod full_context_label;
 mod kana_parser;
 mod model;
 mod mora_list;
 pub(crate) mod open_jtalk;
 
 pub(crate) use self::acoustic_feature_extractor::OjtPhoneme;
+pub use self::audio_file::wav_from_s16le;
 pub(crate) use self::full_context_label::{
     extract_full_context_label, mora_to_text, FullContextLabelError,
 };

diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs
@@ -83,7 +83,7 @@ use rstest_reuse;
 
 pub use self::{
     devices::SupportedDevices,
-    engine::{AccentPhrase, AudioQuery, FullcontextExtractor, Mora},
+    engine::{wav_from_s16le, AccentPhrase, AudioQuery, FullcontextExtractor, Mora},
     error::{Error, ErrorKind},
     metas::{
         RawStyleId, RawStyleVersion, SpeakerMeta, StyleId, StyleMeta, StyleType, StyleVersion,