From 101b3e7a0543e30da808f0eb0654c9e9f555f6b2 Mon Sep 17 00:00:00 2001 From: Yosshi999 Date: Sat, 9 Nov 2024 01:33:57 +0900 Subject: [PATCH] apply suggestion, explicit panic in out of range rendering --- crates/voicevox_core/src/synthesizer.rs | 117 ++++++++------------- crates/voicevox_core_python_api/src/lib.rs | 9 ++ example/python/run.py | 2 +- 3 files changed, 52 insertions(+), 76 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 6ce79bc98..e027e3303 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -100,6 +100,7 @@ mod inner { use std::{ io::{Cursor, Write as _}, marker::PhantomData, + ops::Range, sync::Arc, }; use tracing::info; @@ -127,41 +128,29 @@ mod inner { use super::{AccelerationMode, AsyncForOnnxruntime, InitializeOptions, TtsOptions}; const DEFAULT_SAMPLING_RATE: u32 = 24000; - /// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(秒) - const PADDING_DURATION: f64 = 0.4; - /// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(f0フレーム数) - const PADDING_FRAME_LENGTH: usize = { - const fn div_round(num: f64, den: f64) -> u32 { - ((num + den / 2.0) / den) as u32 - } - div_round(PADDING_DURATION * DEFAULT_SAMPLING_RATE as f64, 256.0) as _ - }; + /// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(フレーム数) + const PADDING_FRAME_LENGTH: usize = 38; // (0.4秒 * 24000Hz / 256.0).round() /// 音声特徴量の一部分を変換する際に左右それぞれに確保すべきマージン幅(f0フレーム数) /// 使われているHifiGANのreceptive fieldから計算される const MARGIN: usize = 14; /// [start, end) の音声区間に対応する特徴量を両端にマージンを追加した上で切り出す - fn crop_feat_with_margin( - spec_with_margin: &ndarray::Array2, - start: usize, - end: usize, - ) -> Result> { - use std::cmp::min; - let frame_length_with_margin = spec_with_margin.nrows(); - // 音が途切れてしまうのを避けるworkaround処理が入っている - let frame_length = frame_length_with_margin - MARGIN * 2; - // 実態(workaround paddingを含まない)上での区間 - let clipped_start = min(start, frame_length); - let clipped_end = min(end, frame_length); - // 左右に付加されたマージン幅は常に`MARGIN`である - // 安全マージンを追加したデータ上での区間 - let slice_start = MARGIN + clipped_start - MARGIN; - let slice_end = MARGIN + clipped_end + MARGIN; - Ok(spec_with_margin.slice(ndarray::s![slice_start..slice_end, ..])) + fn crop_with_margin( + audio: &AudioFeature, + range: Range, + ) -> Result> { + if range.start > audio.frame_length || range.end > audio.frame_length { + panic!( + "{range:?} is out of range for audio feature of length {frame_length}", + frame_length = audio.frame_length, + ); + } + let range = range.start..range.end + 2 * MARGIN; + Ok(audio.internal_state.slice(ndarray::s![range, ..])) } /// 変換前に追加した安全マージンを生成音声から取り除く fn trim_margin_from_wave( wave_with_margin: &ndarray::Array1, - ) -> Result> { + ) -> Result> { let wave = wave_with_margin.slice(ndarray::s![ MARGIN * 256..wave_with_margin.len() - MARGIN * 256 ]); @@ -485,7 +474,7 @@ mod inner { end: usize, ) -> Result> { // TODO: 44.1kHzなどの対応 - let spec_segment = crop_feat_with_margin(&audio.internal_state, start, end)?; + let spec_segment = crop_with_margin(&audio, start..end)?; let wave_with_margin = self .render_audio_segment(spec_segment.to_owned(), audio.style_id) .await?; @@ -859,11 +848,11 @@ mod inner { pub(super) async fn render_audio_segment( &self, - spec_segment: ndarray::Array2, + spec: ndarray::Array2, style_id: StyleId, ) -> Result> { let status = self.status.clone(); - A::unblock(move || status.render_audio_segment(spec_segment, style_id)).await + A::unblock(move || status.render_audio_segment(spec, style_id)).await } pub(super) async fn decode( @@ -950,10 +939,11 @@ mod inner { } /// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。 + /// /// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn generate_full_intermediate( &self, - _length: usize, + length: usize, phoneme_size: usize, f0: ndarray::Array1, phoneme_vector: ndarray::Array1, @@ -966,12 +956,9 @@ mod inner { // TODO: 改善したらここのpadding処理を取り除く let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; let length_with_padding = f0.len() + start_and_end_padding_size; - let f0_with_padding = - make_f0_with_padding(&f0.to_vec(), length_with_padding, PADDING_FRAME_LENGTH); + let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); let phoneme_with_padding = make_phoneme_with_padding( - &phoneme_vector.to_vec(), - OjtPhoneme::num_phoneme(), - length_with_padding, + phoneme_vector.into_shape([length, phoneme_size]).unwrap(), PADDING_FRAME_LENGTH, ); @@ -980,22 +967,20 @@ mod inner { } = self.run_session( model_id, GenerateFullIntermediateInput { - f0: ndarray::arr1(&f0_with_padding) + f0: f0_with_padding .into_shape([length_with_padding, 1]) .unwrap(), - phoneme: ndarray::arr1(&phoneme_with_padding) - .into_shape([length_with_padding, phoneme_size]) - .unwrap(), + phoneme: phoneme_with_padding, speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), }, )?; - // マージン分を両端に残して音声特徴量を返す // マージンがデータからはみ出さないことを保証 // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 if MARGIN > PADDING_FRAME_LENGTH { unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); } + // マージン分を両端に残して音声特徴量を返す return Ok(spec_with_padding .slice(ndarray::s![ PADDING_FRAME_LENGTH - MARGIN @@ -1005,43 +990,27 @@ mod inner { .to_owned()); fn make_f0_with_padding( - f0_slice: &[f32], - length_with_padding: usize, + f0_slice: ndarray::Array1, padding_size: usize, - ) -> Vec { + ) -> ndarray::Array1 { // 音が途切れてしまうのを避けるworkaround処理 // 改善したらこの関数を削除する - let mut f0_with_padding = Vec::with_capacity(length_with_padding); - let padding = vec![0.0; padding_size]; - f0_with_padding.extend_from_slice(&padding); - f0_with_padding.extend_from_slice(f0_slice); - f0_with_padding.extend_from_slice(&padding); - f0_with_padding + let padding = ndarray::Array1::::zeros(padding_size); + ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] } fn make_phoneme_with_padding( - phoneme_slice: &[f32], - phoneme_size: usize, - length_with_padding: usize, + phoneme_slice: ndarray::Array2, padding_size: usize, - ) -> Vec { + ) -> ndarray::Array2 { // 音が途切れてしまうのを避けるworkaround処理 // 改善したらこの関数を削除する - let mut padding_phoneme = vec![0.0; phoneme_size]; - padding_phoneme[0] = 1.0; - let padding_phoneme_len = padding_phoneme.len(); - let padding_phonemes: Vec = padding_phoneme - .into_iter() - .cycle() - .take(padding_phoneme_len * padding_size) - .collect(); - let mut phoneme_with_padding = - Vec::with_capacity(phoneme_size * length_with_padding); - phoneme_with_padding.extend_from_slice(&padding_phonemes); - phoneme_with_padding.extend_from_slice(phoneme_slice); - phoneme_with_padding.extend_from_slice(&padding_phonemes); - - phoneme_with_padding + let mut padding = + ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); + padding + .slice_mut(ndarray::s![.., 0]) + .assign(&ndarray::arr0(1.0)); + ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] } } @@ -1049,12 +1018,12 @@ mod inner { /// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn render_audio_segment( &self, - spec_segment: ndarray::Array2, + spec: ndarray::Array2, style_id: StyleId, ) -> Result> { let (model_id, _inner_voice_id) = self.ids_for::(style_id)?; let RenderAudioSegmentOutput { wave } = - self.run_session(model_id, RenderAudioSegmentInput { spec: spec_segment })?; + self.run_session(model_id, RenderAudioSegmentInput { spec })?; Ok(wave) } @@ -1592,12 +1561,10 @@ pub(crate) mod blocking { pub fn render_audio_segment( &self, - spec_segment: ndarray::Array2, + spec: ndarray::Array2, style_id: StyleId, ) -> crate::Result> { - self.0 - .render_audio_segment(spec_segment, style_id) - .block_on() + self.0.render_audio_segment(spec, style_id).block_on() } pub fn decode( diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index a2d1c2475..3f3a51cb1 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -282,6 +282,7 @@ mod blocking { use camino::Utf8PathBuf; use pyo3::{ + exceptions::PyIndexError, pyclass, pymethods, types::{IntoPyDict as _, PyBytes, PyDict, PyList}, Py, PyAny, PyObject, PyRef, PyResult, Python, @@ -709,6 +710,14 @@ mod blocking { end: usize, py: Python<'py>, ) -> PyResult<&'py PyBytes> { + if start > audio.frame_length() || end > audio.frame_length() { + return Err(PyIndexError::new_err(format!( + "({}, {}) is out of range for audio feature of length {}", + start, + end, + audio.frame_length(), + ))); + } let wav = &self .synthesizer .read()? diff --git a/example/python/run.py b/example/python/run.py index 17f7e688f..bfde292b5 100644 --- a/example/python/run.py +++ b/example/python/run.py @@ -58,7 +58,7 @@ def main() -> None: pcm = b"" for i in range(0, intermediate.frame_length, chunk_frames): logger.info("%s", f"{i/intermediate.frame_length:.2%}") - pcm += synthesizer.render(intermediate, i, i + chunk_frames) + pcm += synthesizer.render(intermediate, i, min(i + chunk_frames, intermediate.frame_length)) logger.info("%s", f"100%") wav = wav_from_s16le( pcm, audio_query.output_sampling_rate, audio_query.output_stereo