From eb263032bf335ecbf91da02a85f267b84d8afce6 Mon Sep 17 00:00:00 2001
From: Yosshi999
Date: Sun, 3 Nov 2024 16:19:56 +0900
Subject: [PATCH] move padding to generate_full_intermediate and margin to render_audio_segment

---
 crates/voicevox_core/src/synthesizer.rs | 234 +++++++++++++-----------
 1 file changed, 129 insertions(+), 105 deletions(-)

diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
index b30d7c3ca..fb3cddc74 100644
--- a/crates/voicevox_core/src/synthesizer.rs
+++ b/crates/voicevox_core/src/synthesizer.rs
@@ -127,6 +127,15 @@ mod inner {
     use super::{AccelerationMode, AsyncForOnnxruntime, InitializeOptions, TtsOptions};
 
     const DEFAULT_SAMPLING_RATE: u32 = 24000;
+    // 音が途切れてしまうのを避けるworkaround処理のためのパディング幅
+    const PADDING_DURATION: f64 = 0.4; // パディング秒
+    const PADDING_FRAME_LENGTH: usize = {
+        // パディングフレーム数
+        const fn div_round(num: f64, den: f64) -> u32 {
+            ((num + den / 2.0) / den) as u32
+        }
+        div_round(PADDING_DURATION * DEFAULT_SAMPLING_RATE as f64, 256.0) as _
+    };
 
     /// 音声の中間表現。
     pub struct AudioFeature {
@@ -138,8 +147,6 @@ mod inner {
         pub frame_length: usize,
         /// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。
         pub frame_rate: f64,
-        /// workaroundとして付け足されているパディング長。
-        padding_frame_length: usize,
         /// 生成時に利用したクエリ。
         audio_query: AudioQuery,
     }
@@ -373,28 +380,12 @@ mod inner {
                 }
             }
 
-            // 音が途切れてしまうのを避けるworkaround処理が入っている
-            // NOTE: `render()`内でこのpaddingを取り除くために、padding_frame_lengthにpadding長を保持している。
-            // TODO: 改善したらここのpadding処理を取り除く
-            const PADDING_SIZE: f64 = 0.4;
-            let padding_size =
-                ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
-            let start_and_end_padding_size = 2 * padding_size;
-            let length_with_padding = f0.len() + start_and_end_padding_size;
-            let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size);
-            let phoneme_with_padding = make_phoneme_with_padding(
-                phoneme.as_flattened(),
-                OjtPhoneme::num_phoneme(),
-                length_with_padding,
-                padding_size,
-            );
-
             let spec = self
                 .generate_full_intermediate(
-                    f0_with_padding.len(),
+                    f0.len(),
                     OjtPhoneme::num_phoneme(),
-                    &f0_with_padding,
-                    &phoneme_with_padding,
+                    &f0,
+                    &phoneme.as_flattened(),
                     style_id,
                 )
                 .await?;
@@ -403,7 +394,6 @@
                 style_id,
                 frame_length: f0.len(),
                 frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0,
-                padding_frame_length: padding_size,
                 audio_query: audio_query.clone(),
             });
 
@@ -455,46 +445,6 @@
                     pitch,
                 }
             }
-
-            fn make_f0_with_padding(
-                f0_slice: &[f32],
-                length_with_padding: usize,
-                padding_size: usize,
-            ) -> Vec<f32> {
-                // 音が途切れてしまうのを避けるworkaround処理
-                // 改善したらこの関数を削除する
-                let mut f0_with_padding = Vec::with_capacity(length_with_padding);
-                let padding = vec![0.0; padding_size];
-                f0_with_padding.extend_from_slice(&padding);
-                f0_with_padding.extend_from_slice(f0_slice);
-                f0_with_padding.extend_from_slice(&padding);
-                f0_with_padding
-            }
-
-            fn make_phoneme_with_padding(
-                phoneme_slice: &[f32],
-                phoneme_size: usize,
-                length_with_padding: usize,
-                padding_size: usize,
-            ) -> Vec<f32> {
-                // 音が途切れてしまうのを避けるworkaround処理
-                // 改善したらこの関数を削除する
-                let mut padding_phoneme = vec![0.0; phoneme_size];
-                padding_phoneme[0] = 1.0;
-                let padding_phoneme_len = padding_phoneme.len();
-                let padding_phonemes: Vec<f32> = padding_phoneme
-                    .into_iter()
-                    .cycle()
-                    .take(padding_phoneme_len * padding_size)
-                    .collect();
-                let mut phoneme_with_padding =
-                    Vec::with_capacity(phoneme_size * length_with_padding);
-                phoneme_with_padding.extend_from_slice(&padding_phonemes);
-                phoneme_with_padding.extend_from_slice(phoneme_slice);
-                phoneme_with_padding.extend_from_slice(&padding_phonemes);
-
-                phoneme_with_padding
-            }
         }
 
@@ -504,41 +454,10 @@
         pub(super) async fn render(
             &self,
             audio: &AudioFeature,
             start: usize,
             end: usize,
         ) -> Result<Vec<u8>> {
             // TODO: 44.1kHzなどの対応
-            const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン
-            use std::cmp::min;
-            // 実態(workaround paddingを含まない)上での区間
-            let clipped_start = min(start, audio.frame_length);
-            let clipped_end = min(end, audio.frame_length);
-            // 指定領域が空の区間だった場合、ONNXRuntimeに渡す前に早期リターン
-            if (clipped_start..clipped_end).is_empty() {
-                return Ok(vec![]);
-            }
-            // マージンがデータからはみ出さないことを保証
-            // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
-            if MARGIN > audio.padding_frame_length + clipped_start
-                || MARGIN > audio.padding_frame_length + (audio.frame_length - clipped_end)
-            {
-                unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
-            }
-            let left_margin = MARGIN;
-            let right_margin = MARGIN;
-            // 安全マージンを追加したデータ上での区間
-            let slice_start = audio.padding_frame_length + clipped_start - left_margin;
-            let slice_end = audio.padding_frame_length + clipped_end + right_margin;
-            let segment = audio
-                .internal_state
-                .slice(ndarray::s![slice_start..slice_end, ..]);
-            let wave_with_margin = self
-                .render_audio_segment(segment.into_owned(), audio.style_id)
+            let wave = self
+                .render_audio_segment(audio.internal_state.clone(), start, end, audio.style_id)
                 .await?;
-            // 変換前に追加した安全マージンを生成音声から取り除く
-            let wave = wave_with_margin
-                .slice(ndarray::s![
-                    left_margin * 256..wave_with_margin.len() - right_margin * 256
-                ])
-                .into_owned()
-                .into_raw_vec();
-            return Ok(to_s16le_pcm(&wave, &audio.audio_query));
+            return Ok(to_s16le_pcm(&wave.into_raw_vec(), &audio.audio_query));
 
             fn to_s16le_pcm(
                 wave: &[f32],
@@ -908,10 +827,12 @@
         pub(super) async fn render_audio_segment(
             &self,
             spec: ndarray::Array2<f32>,
+            start: usize,
+            end: usize,
             style_id: StyleId,
         ) -> Result<ndarray::Array1<f32>> {
             let status = self.status.clone();
-            A::unblock(move || status.render_audio_segment(spec, style_id)).await
+            A::unblock(move || status.render_audio_segment(spec, start, end, style_id)).await
         }
 
         pub(super) async fn decode(
@@ -1000,7 +921,7 @@
         /// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。
         fn generate_full_intermediate(
             &self,
-            length: usize,
+            _length: usize,
             phoneme_size: usize,
             f0: ndarray::Array1<f32>,
             phoneme_vector: ndarray::Array1<f32>,
@@ -1008,26 +929,125 @@
         ) -> Result<ndarray::Array2<f32>> {
             let (model_id, inner_voice_id) = self.ids_for::<TalkDomain>(style_id)?;
 
+            // 音が途切れてしまうのを避けるworkaround処理が入っている
+            // NOTE: `render()`内でこのpaddingを取り除くために、padding_frame_lengthにpadding長を保持している。
+            // TODO: 改善したらここのpadding処理を取り除く
+            let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH;
+            let length_with_padding = f0.len() + start_and_end_padding_size;
+            let f0_with_padding =
+                make_f0_with_padding(&f0.to_vec(), length_with_padding, PADDING_FRAME_LENGTH);
+            let phoneme_with_padding = make_phoneme_with_padding(
+                &phoneme_vector.to_vec(),
+                OjtPhoneme::num_phoneme(),
+                length_with_padding,
+                PADDING_FRAME_LENGTH,
+            );
+
             let GenerateFullIntermediateOutput { spec } = self.run_session(
                 model_id,
                 GenerateFullIntermediateInput {
-                    f0: f0.into_shape([length, 1]).unwrap(),
-                    phoneme: phoneme_vector.into_shape([length, phoneme_size]).unwrap(),
+                    f0: ndarray::arr1(&f0_with_padding)
+                        .into_shape([length_with_padding, 1])
+                        .unwrap(),
+                    phoneme: ndarray::arr1(&phoneme_with_padding)
+                        .into_shape([length_with_padding, phoneme_size])
+                        .unwrap(),
                     speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
                 },
             )?;
-            Ok(spec)
+            return Ok(spec);
+
+            fn make_f0_with_padding(
+                f0_slice: &[f32],
+                length_with_padding: usize,
+                padding_size: usize,
+            ) -> Vec<f32> {
+                // 音が途切れてしまうのを避けるworkaround処理
+                // 改善したらこの関数を削除する
+                let mut f0_with_padding = Vec::with_capacity(length_with_padding);
+                let padding = vec![0.0; padding_size];
+                f0_with_padding.extend_from_slice(&padding);
+                f0_with_padding.extend_from_slice(f0_slice);
+                f0_with_padding.extend_from_slice(&padding);
+                f0_with_padding
+            }
+
+            fn make_phoneme_with_padding(
+                phoneme_slice: &[f32],
+                phoneme_size: usize,
+                length_with_padding: usize,
+                padding_size: usize,
+            ) -> Vec<f32> {
+                // 音が途切れてしまうのを避けるworkaround処理
+                // 改善したらこの関数を削除する
+                let mut padding_phoneme = vec![0.0; phoneme_size];
+                padding_phoneme[0] = 1.0;
+                let padding_phoneme_len = padding_phoneme.len();
+                let padding_phonemes: Vec<f32> = padding_phoneme
+                    .into_iter()
+                    .cycle()
+                    .take(padding_phoneme_len * padding_size)
+                    .collect();
+                let mut phoneme_with_padding =
+                    Vec::with_capacity(phoneme_size * length_with_padding);
+                phoneme_with_padding.extend_from_slice(&padding_phonemes);
+                phoneme_with_padding.extend_from_slice(phoneme_slice);
+                phoneme_with_padding.extend_from_slice(&padding_phonemes);
+
+                phoneme_with_padding
+            }
         }
 
         /// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。
         fn render_audio_segment(
             &self,
             spec: ndarray::Array2<f32>,
+            start: usize,
+            end: usize,
             style_id: StyleId,
         ) -> Result<ndarray::Array1<f32>> {
             let (model_id, _inner_voice_id) = self.ids_for::<TalkDomain>(style_id)?;
-            let RenderAudioSegmentOutput { wave } =
-                self.run_session(model_id, RenderAudioSegmentInput { spec })?;
+            let frame_length_with_padding = spec.nrows();
+            // 音が途切れてしまうのを避けるworkaround処理が入っている
+            let frame_length = frame_length_with_padding - PADDING_FRAME_LENGTH * 2;
+
+            // [start, end) の区間を両端にマージンを追加した上で音声変換する
+            const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン
+            use std::cmp::min;
+            // 実態(workaround paddingを含まない)上での区間
+            let clipped_start = min(start, frame_length);
+            let clipped_end = min(end, frame_length);
+            // 指定領域が空の区間だった場合、ONNXRuntimeに渡す前に早期リターン
+            if (clipped_start..clipped_end).is_empty() {
+                return Ok(ndarray::arr1(&vec![]));
+            }
+            // マージンがデータからはみ出さないことを保証
+            // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
+            if MARGIN > PADDING_FRAME_LENGTH + clipped_start
+                || MARGIN > PADDING_FRAME_LENGTH + (frame_length - clipped_end)
+            {
+                unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
+            }
+            let left_margin = MARGIN;
+            let right_margin = MARGIN;
+            // 安全マージンを追加したデータ上での区間
+            let slice_start = PADDING_FRAME_LENGTH + clipped_start - left_margin;
+            let slice_end = PADDING_FRAME_LENGTH + clipped_end + right_margin;
+            let spec_segment = spec.slice(ndarray::s![slice_start..slice_end, ..]);
+            let RenderAudioSegmentOutput {
+                wave: wave_with_margin,
+            } = self.run_session(
+                model_id,
+                RenderAudioSegmentInput {
+                    spec: spec_segment.into_owned(),
+                },
+            )?;
+            // 変換前に追加した安全マージンを生成音声から取り除く
+            let wave = wave_with_margin
+                .slice(ndarray::s![
+                    left_margin * 256..wave_with_margin.len() - right_margin * 256
+                ])
+                .into_owned();
             Ok(wave)
         }
 
@@ -1047,7 +1067,7 @@
                 phoneme_vector,
                 style_id,
             )?;
-            let output = self.render_audio_segment(intermediate, style_id)?;
+            let output = self.render_audio_segment(intermediate, 0, length, style_id)?;
             Ok(output.into_raw_vec())
         }
     }
@@ -1565,9 +1585,13 @@ pub(crate) mod blocking {
         pub fn render_audio_segment(
             &self,
             spec: ndarray::Array2<f32>,
+            start: usize,
+            end: usize,
             style_id: StyleId,
         ) -> crate::Result<ndarray::Array1<f32>> {
-            self.0.render_audio_segment(spec, style_id).block_on()
+            self.0
+                .render_audio_segment(spec, start, end, style_id)
+                .block_on()
         }
 
         pub fn decode(
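
Notes on the change follow. They are illustrative sketches for review and are not part of the patch itself.

The new PADDING_FRAME_LENGTH constant is evaluated at compile time from the 0.4 s padding duration ("パディング秒"), the 24 kHz output rate, and the 256 samples generated per spectrogram frame. A minimal standalone sketch of the same arithmetic; it copies the patch's div_round and only adds an assertion, and it assumes a toolchain where floating-point arithmetic in const fn is stable, as the patch itself does:

    // Compile-time padding computation, mirroring the patch.
    const DEFAULT_SAMPLING_RATE: u32 = 24000;
    const PADDING_DURATION: f64 = 0.4; // seconds of padding on each side
    const PADDING_FRAME_LENGTH: usize = {
        const fn div_round(num: f64, den: f64) -> u32 {
            ((num + den / 2.0) / den) as u32
        }
        div_round(PADDING_DURATION * DEFAULT_SAMPLING_RATE as f64, 256.0) as _
    };

    fn main() {
        // 0.4 s * 24000 Hz = 9600 samples = 37.5 frames, rounded to 38.
        assert_eq!(PADDING_FRAME_LENGTH, 38);
        println!("padding frames per side: {}", PADDING_FRAME_LENGTH);
    }

Because the constant now lives next to DEFAULT_SAMPLING_RATE, precompute_render and render_audio_segment no longer need to pass an ad-hoc padding_size value around at runtime, which is why the padding_frame_length field can be dropped from AudioFeature.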
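
generate_full_intermediate now surrounds the real frames with PADDING_FRAME_LENGTH rows on each side: the f0 padding is 0.0 and the phoneme padding is a one-hot row with index 0 set, assumed here to be the silent phoneme. A small self-contained sketch of that layout with toy sizes, using plain Vec<f32> instead of the crate's helpers:

    // Sketch of the layout built by make_f0_with_padding / make_phoneme_with_padding.
    fn pad_f0(f0: &[f32], padding: usize) -> Vec<f32> {
        let mut out = vec![0.0; padding];                  // leading silence
        out.extend_from_slice(f0);                         // real frames
        out.extend(std::iter::repeat(0.0).take(padding));  // trailing silence
        out
    }

    fn pad_phoneme(phoneme: &[f32], phoneme_size: usize, padding: usize) -> Vec<f32> {
        // One one-hot row (index 0 set) per padding frame, on both sides.
        let mut onehot = vec![0.0; phoneme_size];
        onehot[0] = 1.0;
        let pad_block: Vec<f32> = onehot
            .iter()
            .copied()
            .cycle()
            .take(phoneme_size * padding)
            .collect();
        let mut out = pad_block.clone();
        out.extend_from_slice(phoneme);
        out.extend_from_slice(&pad_block);
        out
    }

    fn main() {
        // Two real frames, three phoneme classes, one padding frame per side.
        let f0 = [100.0, 120.0];
        let phoneme = [0.0, 1.0, 0.0, 0.0, 0.0, 1.0];
        assert_eq!(pad_f0(&f0, 1), vec![0.0, 100.0, 120.0, 0.0]);
        assert_eq!(
            pad_phoneme(&phoneme, 3, 1),
            vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]
        );
    }

As the "音が途切れてしまうのを避ける" (avoid the audio being cut off) comments explain, the model sees silence before and after the requested frames, which keeps the start and end of the audible audio from being clipped.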
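
render_audio_segment now owns all of the margin bookkeeping that the old render() did: clip [start, end) to the unpadded length, return early for an empty range, widen the range by the 14-frame safety margin derived from the HiFi-GAN receptive field, slice the padded spectrogram, and finally drop margin * 256 samples from each end of the generated wave. A self-contained sketch of the index arithmetic with plain integers; the constants mirror the patch:

    const PADDING_FRAME_LENGTH: usize = 38; // padding frames per side
    const MARGIN: usize = 14;               // HiFi-GAN receptive-field margin
    const SAMPLES_PER_FRAME: usize = 256;   // audio samples per spectrogram frame

    /// Returns the slice bounds into the padded spectrogram, or None for an
    /// empty request (the patch returns an empty wave in that case).
    fn segment_bounds(frame_length: usize, start: usize, end: usize) -> Option<(usize, usize)> {
        let clipped_start = start.min(frame_length);
        let clipped_end = end.min(frame_length);
        if clipped_start >= clipped_end {
            return None;
        }
        let slice_start = PADDING_FRAME_LENGTH + clipped_start - MARGIN;
        let slice_end = PADDING_FRAME_LENGTH + clipped_end + MARGIN;
        Some((slice_start, slice_end))
    }

    fn main() {
        // Request frames [0, 100) of a 500-frame (unpadded) feature.
        let (s, e) = segment_bounds(500, 0, 100).unwrap();
        assert_eq!((s, e), (24, 152)); // 38 - 14 and 38 + 100 + 14
        // The vocoder output then loses MARGIN * 256 samples at both ends,
        // leaving exactly 100 * SAMPLES_PER_FRAME samples for the request.
        assert_eq!((e - s - 2 * MARGIN) * SAMPLES_PER_FRAME, 100 * 256);
    }

The unreachable! guard ("Validation error: Too short padding for input") encodes the invariant that makes the subtraction above safe: MARGIN never exceeds PADDING_FRAME_LENGTH plus the clipped distance to either edge, and 14 < 38.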
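
Callers now choose the [start, end) frame range themselves; the synchronous decode() simply passes 0..length, while streaming callers can render chunks of a precomputed AudioFeature. The helper below is hypothetical (it is not in the patch or the crate) and only illustrates how a time range in seconds maps onto that frame range, using the frame_rate documented on AudioFeature (24000 / 256 = 93.75 frames per second):

    /// Hypothetical helper: convert a time range in seconds to a frame range
    /// suitable for render(start, end). Not part of voicevox_core.
    fn seconds_to_frame_range(
        from_sec: f64,
        to_sec: f64,
        frame_rate: f64,
        frame_length: usize,
    ) -> (usize, usize) {
        let start = (from_sec * frame_rate).floor() as usize;
        let end = ((to_sec * frame_rate).ceil() as usize).min(frame_length);
        (start.min(end), end)
    }

    fn main() {
        let frame_rate = 24000.0 / 256.0; // 93.75, as in the patch
        let frame_length = (3.0 * frame_rate) as usize; // a 3-second feature: 281 frames
        let (start, end) = seconds_to_frame_range(0.5, 1.5, frame_rate, frame_length);
        assert_eq!((start, end), (46, 141));
    }

Out-of-range values are safe to pass: render_audio_segment clips start and end to the unpadded frame length and produces an empty wave for an empty range.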