From 90e973056b7802e476e1b4b66814d1440856ce5b Mon Sep 17 00:00:00 2001 From: Hiroshiba Kazuyuki Date: Tue, 6 Mar 2018 00:09:09 +0900 Subject: [PATCH 1/2] wip --- become_yukarin/voice_changer.py | 272 +++++++++++++++++++++++++----- scripts/realtime_voice_changer.py | 153 ++++++++++++----- tests/test_voice_changer.py | 38 ++++- 3 files changed, 368 insertions(+), 95 deletions(-) diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py index 5e0eac0..3b75fb1 100644 --- a/become_yukarin/voice_changer.py +++ b/become_yukarin/voice_changer.py @@ -15,7 +15,6 @@ def __init__( self, acoustic_converter: AcousticConverter, super_resolution: SuperResolution, - vocoder: Vocoder, output_sampling_rate: int = None, ) -> None: if output_sampling_rate is None: @@ -23,18 +22,17 @@ def __init__( self.acoustic_converter = acoustic_converter self.super_resolution = super_resolution - self.vocoder = vocoder self.output_sampling_rate = output_sampling_rate - def convert_from_wave_path(self, wave_path: str): - w_in = self.acoustic_converter._wave_process(wave_path) - return self.convert_from_wave(w_in) - - def convert_from_wave(self, wave: Wave): - f_in = self.acoustic_converter._feature_process(wave) - f_high = self.convert_from_acoustic_feature(f_in) - wave = self.vocoder.decode(f_high) - return wave + # def convert_from_wave_path(self, wave_path: str): + # w_in = self.acoustic_converter._wave_process(wave_path) + # return self.convert_from_wave(w_in) + # + # def convert_from_wave(self, wave: Wave): + # f_in = self.acoustic_converter._feature_process(wave) + # f_high = self.convert_from_acoustic_feature(f_in) + # wave = self.vocoder.decode(f_high) + # return wave def convert_from_acoustic_feature(self, f_in: AcousticFeature): f_low = self.acoustic_converter.convert_to_feature(f_in) @@ -43,6 +41,20 @@ def convert_from_acoustic_feature(self, f_in: AcousticFeature): return f_high +class FeatureSegment(NamedTuple): + start_time: float + feature: AcousticFeature + frame_period: float + + @property + def time_length(self): + return len(self.feature.f0) * self.frame_period / 1000 + + @property + def end_time(self): + return self.time_length + self.start_time + + class Segment(NamedTuple): start_time: float wave: Wave @@ -59,18 +71,19 @@ def end_time(self): class VoiceChangerStream(object): def __init__( self, - voice_changer: VoiceChanger, sampling_rate: int, + frame_period: float, in_dtype=numpy.float32, ): - self.voice_changer = voice_changer self.sampling_rate = sampling_rate + self.frame_period = frame_period self.in_dtype = in_dtype - self._data_stream = [] # type: List[Segment] - @property - def vocoder(self): - return self.voice_changer.vocoder + self.voice_changer: VoiceChanger = None + self.vocoder: Vocoder = None + self._data_stream = [] # type: List[Segment] + self._in_feature_stream = [] # type: List[FeatureSegment] + self._out_feature_stream = [] # type: List[FeatureSegment] def add_wave(self, start_time: float, wave: Wave): # validation @@ -80,10 +93,30 @@ def add_wave(self, start_time: float, wave: Wave): segment = Segment(start_time=start_time, wave=wave) self._data_stream.append(segment) - def remove_wave(self, end_time: float): + def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float): + # validation + assert frame_period == self.frame_period + assert feature.f0.dtype == self.in_dtype + + segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period) + self._in_feature_stream.append(segment) + + def 
add_out_feature(self, start_time: float, feature: AcousticFeature, frame_period: float): + # validation + assert frame_period == self.frame_period + + segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period) + self._out_feature_stream.append(segment) + + def remove(self, end_time: float): self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream)) + self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream)) + self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream)) + + def pre_convert(self, start_time: float, time_length: float, extra_time: float): + start_time -= extra_time + time_length += extra_time * 2 - def convert_to_feature(self, start_time: float, time_length: float): end_time = start_time + time_length buffer_list = [] stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream) @@ -123,38 +156,161 @@ def convert_to_feature(self, start_time: float, time_length: float): buffer = numpy.concatenate(buffer_list) in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate) in_feature = self.vocoder.encode(in_wave) - out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature) - return out_feature - def convert(self, start_time: float, time_length: float): - feature = self.convert_to_feature(start_time=start_time, time_length=time_length) - out_wave = self.vocoder.decode( - acoustic_feature=feature, + pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000)) + in_feature = AcousticFeature( + f0=in_feature.f0[pad:-pad], + spectrogram=in_feature.spectrogram[pad:-pad], + aperiodicity=in_feature.aperiodicity[pad:-pad], + mfcc=in_feature.mfcc[pad:-pad], + voiced=in_feature.voiced[pad:-pad], ) - return out_wave - - def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float): - """ - :param extra_time: 音声変換時に余分に使うデータの時間長。ゼロパディングを防ぐ。 - """ - frame_period = self.vocoder.acoustic_feature_param.frame_period + return in_feature + def convert(self, start_time: float, time_length: float, extra_time: float): start_time -= extra_time time_length += extra_time * 2 - extra_feature = self.convert_to_feature(start_time=start_time, time_length=time_length) + order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order + + end_time = start_time + time_length + f0_buffer_list = [] + mfcc_buffer_list = [] + ap_buffer_list = [] + voiced_buffer_list = [] + stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._in_feature_stream) + + start_time_buffer = start_time + remaining_time = time_length + for segment in stream: + # padding + if segment.start_time > start_time_buffer: + pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period) + dims = AcousticFeature.get_sizes(self.sampling_rate, order) + + f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) + mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype)) + ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) + voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool)) + + start_time_buffer = segment.start_time + if remaining_time > segment.end_time - start_time_buffer: + one_time_length = segment.end_time - start_time_buffer + else: + one_time_length = remaining_time + + first_index = 
int((start_time_buffer - segment.start_time) * 1000 / self.frame_period) + last_index = int(first_index + one_time_length * 1000 / self.frame_period) + + f0_buffer_list.append(segment.feature.f0[first_index:last_index]) + mfcc_buffer_list.append(segment.feature.mfcc[first_index:last_index]) + ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index]) + voiced_buffer_list.append(segment.feature.voiced[first_index:last_index]) + + start_time_buffer += one_time_length + remaining_time -= one_time_length + + if start_time_buffer >= end_time: + break + else: + # last padding + pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period) + dims = AcousticFeature.get_sizes(self.sampling_rate, order) + + f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) + mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype)) + ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) + voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool)) + + f0 = numpy.concatenate(f0_buffer_list) + mfcc = numpy.concatenate(mfcc_buffer_list) + aperiodicity = numpy.concatenate(ap_buffer_list) + voiced = numpy.concatenate(voiced_buffer_list) + in_feature = AcousticFeature( + f0=f0, + spectrogram=numpy.nan, + aperiodicity=aperiodicity, + mfcc=mfcc, + voiced=voiced, + ) + + out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature) + + pad = int(extra_time * 1000 / self.frame_period) + out_feature= AcousticFeature( + f0=out_feature.f0[pad:-pad], + spectrogram=out_feature.spectrogram[pad:-pad], + aperiodicity=out_feature.aperiodicity[pad:-pad], + mfcc=out_feature.mfcc[pad:-pad], + voiced=out_feature.voiced[pad:-pad], + ) + return out_feature + + def post_convert(self, start_time: float, time_length: float): + end_time = start_time + time_length + f0_buffer_list = [] + sp_buffer_list = [] + ap_buffer_list = [] + voiced_buffer_list = [] + stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._out_feature_stream) + + start_time_buffer = start_time + remaining_time = time_length + for segment in stream: + # padding + if segment.start_time > start_time_buffer: + pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period) + dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order) + + f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) + sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype)) + ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) + voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool)) + + start_time_buffer = segment.start_time + + if remaining_time > segment.end_time - start_time_buffer: + one_time_length = segment.end_time - start_time_buffer + else: + one_time_length = remaining_time - pad = int(extra_time / (frame_period / 1000)) - feature = AcousticFeature( - f0=extra_feature.f0[pad:-pad], - spectrogram=extra_feature.spectrogram[pad:-pad], - aperiodicity=extra_feature.aperiodicity[pad:-pad], - mfcc=extra_feature.mfcc[pad:-pad], - voiced=extra_feature.voiced[pad:-pad], + first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period) + last_index = int(first_index + one_time_length * 1000 / self.frame_period) + + f0_buffer_list.append(segment.feature.f0[first_index:last_index]) + 
sp_buffer_list.append(segment.feature.spectrogram[first_index:last_index]) + ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index]) + voiced_buffer_list.append(segment.feature.voiced[first_index:last_index]) + + start_time_buffer += one_time_length + remaining_time -= one_time_length + + if start_time_buffer >= end_time: + break + else: + # last padding + pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period) + dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order) + + f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) + sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype)) + ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) + voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) + + f0 = numpy.concatenate(f0_buffer_list) + spectrogram = numpy.concatenate(sp_buffer_list) + aperiodicity = numpy.concatenate(ap_buffer_list) + voiced = numpy.concatenate(voiced_buffer_list) + out_feature = AcousticFeature( + f0=f0, + spectrogram=spectrogram, + aperiodicity=aperiodicity, + mfcc=numpy.nan, + voiced=voiced, ) out_wave = self.vocoder.decode( - acoustic_feature=feature, + acoustic_feature=out_feature, ) return out_wave @@ -163,20 +319,46 @@ class VoiceChangerStreamWrapper(object): def __init__( self, voice_changer_stream: VoiceChangerStream, - extra_time: float = 0.0 + extra_time_pre: float = 0.0, + extra_time: float = 0.0, ): self.voice_changer_stream = voice_changer_stream + self.extra_time_pre = extra_time_pre self.extra_time = extra_time + self._current_time_pre = 0 self._current_time = 0 + self._current_time_post = 0 + + def pre_convert_next(self, time_length: float): + in_feature = self.voice_changer_stream.pre_convert( + start_time=self._current_time_pre, + time_length=time_length, + extra_time=self.extra_time_pre, + ) + self._current_time_pre += time_length + return in_feature def convert_next(self, time_length: float): - out_wave = self.voice_changer_stream.convert_with_extra_time( + out_feature = self.voice_changer_stream.convert( start_time=self._current_time, time_length=time_length, extra_time=self.extra_time, ) self._current_time += time_length + return out_feature + + def post_convert_next(self, time_length: float): + out_wave = self.voice_changer_stream.post_convert( + start_time=self._current_time_post, + time_length=time_length, + ) + self._current_time_post += time_length return out_wave - def remove_previous_wave(self): - self.voice_changer_stream.remove_wave(end_time=self._current_time - self.extra_time) + def remove_previous(self): + end_time = min( + self._current_time_pre - self.extra_time_pre, + self._current_time - self.extra_time, + self._current_time_post, + ) + self.voice_changer_stream.remove(end_time=end_time) diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py index a5d1a21..e96ce4e 100644 --- a/scripts/realtime_voice_changer.py +++ b/scripts/realtime_voice_changer.py @@ -14,92 +14,128 @@ import pyaudio from become_yukarin import AcousticConverter +from become_yukarin import Vocoder from become_yukarin import RealtimeVocoder from become_yukarin import SuperResolution from become_yukarin import VoiceChanger +from become_yukarin.config.config import Config from become_yukarin.config.config import create_from_json as create_config from become_yukarin.config.sr_config import create_from_json as create_sr_config from 
become_yukarin.data_struct import Wave +from become_yukarin.data_struct import AcousticFeature from become_yukarin.voice_changer import VoiceChangerStream from become_yukarin.voice_changer import VoiceChangerStreamWrapper class AudioConfig(NamedTuple): rate: int + frame_period: float audio_chunk: int convert_chunk: int vocoder_buffer_size: int out_norm: float -def convert_worker( - config, - acoustic_converter, - super_resolution, +def encode_worker( + config: Config, + wrapper: VoiceChangerStreamWrapper, audio_config: AudioConfig, - queue_input_wave, - queue_output_wave, + queue_input: Queue, + queue_output: Queue, ): - vocoder = RealtimeVocoder( + wrapper.voice_changer_stream.vocoder = Vocoder( acoustic_feature_param=config.dataset.param.acoustic_feature_param, out_sampling_rate=audio_config.rate, - buffer_size=audio_config.vocoder_buffer_size, - number_of_pointers=16, ) - # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate) - voice_changer = VoiceChanger( + start_time = 0 + time_length = audio_config.convert_chunk / audio_config.rate + + while True: + wave = queue_input.get() + + w = Wave(wave=wave, sampling_rate=audio_config.rate) + wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w) + start_time += time_length + + feature = wrapper.pre_convert_next(time_length=time_length) + queue_output.put(feature) + + +def convert_worker( + config: Config, + wrapper: VoiceChangerStreamWrapper, + acoustic_converter: AcousticConverter, + super_resolution: SuperResolution, + audio_config: AudioConfig, + queue_input: Queue, + queue_output: Queue, +): + wrapper.voice_changer_stream.voice_changer = VoiceChanger( super_resolution=super_resolution, acoustic_converter=acoustic_converter, - vocoder=vocoder, ) - voice_changer_stream = VoiceChangerStream( - voice_changer=voice_changer, - sampling_rate=audio_config.rate, - in_dtype=numpy.float32, - ) + start_time = 0 + time_length = audio_config.convert_chunk / audio_config.rate + while True: + in_feature: AcousticFeature = queue_input.get() + wrapper.voice_changer_stream.add_in_feature( + start_time=start_time, + feature=in_feature, + frame_period=audio_config.frame_period, + ) + start_time += time_length - wrapper = VoiceChangerStreamWrapper( - voice_changer_stream=voice_changer_stream, - extra_time=0.1, + out_feature = wrapper.convert_next(time_length=time_length) + queue_output.put(out_feature) + + +def decode_worker( + config: Config, + wrapper: VoiceChangerStreamWrapper, + audio_config: AudioConfig, + queue_input: Queue, + queue_output: Queue, +): + wrapper.voice_changer_stream.vocoder = RealtimeVocoder( + acoustic_feature_param=config.dataset.param.acoustic_feature_param, + out_sampling_rate=audio_config.rate, + buffer_size=audio_config.vocoder_buffer_size, + number_of_pointers=16, ) + # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate) start_time = 0 - wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32) - wave = Wave(wave=wave, sampling_rate=audio_config.rate) - wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave) - start_time += len(wave.wave) / wave.sampling_rate - wave = wrapper.convert_next(time_length=1) - time_length = audio_config.convert_chunk / audio_config.rate wave_fragment = numpy.empty(0) while True: - wave = queue_input_wave.get() - w = Wave(wave=wave, sampling_rate=audio_config.rate) - wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w) + feature: AcousticFeature = 
queue_input.get() + wrapper.voice_changer_stream.add_out_feature( + start_time=start_time, + feature=feature, + frame_period=audio_config.frame_period, + ) start_time += time_length - b = time.time() - wave = wrapper.convert_next(time_length=time_length).wave - print('time', time.time()-b, flush=True) - wrapper.remove_previous_wave() - print('converted wave', len(wave), flush=True) + wave = wrapper.post_convert_next(time_length=time_length).wave wave_fragment = numpy.concatenate([wave_fragment, wave]) if len(wave_fragment) >= audio_config.audio_chunk: wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:] - queue_output_wave.put(wave) + queue_output.put(wave) def main(): print('model loading...', flush=True) queue_input_wave = Queue() + queue_input_feature = Queue() + queue_output_feature = Queue() queue_output_wave = Queue() - model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz') - config_path = Path('./trained/harvest-innoise03/config.json') + model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz') + config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json') config = create_config(config_path) acoustic_converter = AcousticConverter(config, model_path, gpu=0) print('model 1 loaded!', flush=True) @@ -113,23 +149,53 @@ def main(): audio_instance = pyaudio.PyAudio() audio_config = AudioConfig( rate=config.dataset.param.voice_param.sample_rate, + frame_period=config.dataset.param.acoustic_feature_param.frame_period, audio_chunk=config.dataset.param.voice_param.sample_rate, convert_chunk=config.dataset.param.voice_param.sample_rate, vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16, out_norm=2.5, ) - process_converter = Process(target=convert_worker, kwargs=dict( + voice_changer_stream = VoiceChangerStream( + sampling_rate=audio_config.rate, + frame_period=config.dataset.param.acoustic_feature_param.frame_period, + in_dtype=numpy.float32, + ) + + wrapper = VoiceChangerStreamWrapper( + voice_changer_stream=voice_changer_stream, + extra_time_pre=0.2, + extra_time=0.1, + ) + + process_encoder = Process(target=encode_worker, kwargs=dict( config=config, + wrapper=wrapper, audio_config=audio_config, + queue_input=queue_input_wave, + queue_output=queue_input_feature, + )) + process_encoder.start() + + process_converter = Process(target=convert_worker, kwargs=dict( + config=config, + wrapper=wrapper, acoustic_converter=acoustic_converter, super_resolution=super_resolution, - queue_input_wave=queue_input_wave, - queue_output_wave=queue_output_wave, + audio_config=audio_config, + queue_input=queue_input_feature, + queue_output=queue_output_feature, )) process_converter.start() - signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate()) + process_decoder = Process(target=decode_worker, kwargs=dict( + config=config, + wrapper=wrapper, + audio_config=audio_config, + queue_input=queue_output_feature, + queue_output=queue_output_wave, + )) + process_decoder.start() audio_stream = audio_instance.open( format=pyaudio.paFloat32, @@ -149,6 +215,11 @@ def main(): print('input', len(wave), flush=True) queue_input_wave.put(wave) + print('queue_input_wave', queue_input_wave.qsize(), flush=True) + print('queue_input_feature', queue_input_feature.qsize(), flush=True) + print('queue_output_feature', queue_output_feature.qsize(), flush=True) + print('queue_output_wave', queue_output_wave.qsize(), flush=True) + # output try: wave = queue_output_wave.get_nowait() diff 
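
The script above is now split into three worker processes, encode_worker, convert_worker and decode_worker, chained by the four queues created in main() (queue_input_wave -> queue_input_feature -> queue_output_feature -> queue_output_wave). What follows is only a minimal sketch of that queue topology with placeholder stage bodies standing in for vocoder.encode, the acoustic model and vocoder.decode; it is not the project's code. Since each Process works on its own copy of the shared wrapper, every worker in the patch installs its own vocoder or voice_changer on that copy.

# Hypothetical sketch of the three-stage pipeline wiring; stage bodies are placeholders.
from multiprocessing import Process, Queue

import numpy


def encode_stage(queue_input: Queue, queue_output: Queue):
    # stands in for encode_worker: raw wave chunk -> acoustic feature
    while True:
        wave = queue_input.get()
        feature = {'f0': numpy.abs(wave[:10])}  # placeholder for vocoder.encode()
        queue_output.put(feature)


def convert_stage(queue_input: Queue, queue_output: Queue):
    # stands in for convert_worker: source feature -> target feature
    while True:
        feature = queue_input.get()
        feature['f0'] = feature['f0'] * 1.5  # placeholder for the acoustic model
        queue_output.put(feature)


def decode_stage(queue_input: Queue, queue_output: Queue):
    # stands in for decode_worker: target feature -> output wave chunk
    while True:
        feature = queue_input.get()
        queue_output.put(numpy.tile(feature['f0'], 160))  # placeholder for vocoder.decode()


if __name__ == '__main__':
    q_wave_in, q_feat_in, q_feat_out, q_wave_out = Queue(), Queue(), Queue(), Queue()
    stages = [
        Process(target=encode_stage, args=(q_wave_in, q_feat_in), daemon=True),
        Process(target=convert_stage, args=(q_feat_in, q_feat_out), daemon=True),
        Process(target=decode_stage, args=(q_feat_out, q_wave_out), daemon=True),
    ]
    for p in stages:
        p.start()

    q_wave_in.put(numpy.random.rand(1024).astype(numpy.float32))
    print(q_wave_out.get().shape)
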
--git a/tests/test_voice_changer.py b/tests/test_voice_changer.py index ceddf9c..66ea003 100644 --- a/tests/test_voice_changer.py +++ b/tests/test_voice_changer.py @@ -32,16 +32,16 @@ class AudioConfig(NamedTuple): print('model loading...', flush=True) -model_path = model_base_path / Path('harvest-innoise03/predictor_1390000.npz') -config_path = model_base_path / Path('harvest-innoise03/config.json') +model_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/predictor_120000.npz') +config_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/config.json') config = create_config(config_path) -acoustic_converter = AcousticConverter(config, model_path, gpu=0) +acoustic_converter = AcousticConverter(config, model_path) print('model 1 loaded!', flush=True) model_path = model_base_path / Path('sr-noise3/predictor_180000.npz') config_path = model_base_path / Path('sr-noise3/config.json') sr_config = create_sr_config(config_path) -super_resolution = SuperResolution(sr_config, model_path, gpu=0) +super_resolution = SuperResolution(sr_config, model_path) print('model 2 loaded!', flush=True) audio_config = AudioConfig( @@ -50,6 +50,7 @@ class AudioConfig(NamedTuple): vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16, out_norm=4.5, ) +frame_period = config.dataset.param.acoustic_feature_param.frame_period vocoder = RealtimeVocoder( acoustic_feature_param=config.dataset.param.acoustic_feature_param, @@ -57,22 +58,24 @@ class AudioConfig(NamedTuple): buffer_size=audio_config.vocoder_buffer_size, number_of_pointers=16, ) -# vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate) voice_changer = VoiceChanger( super_resolution=super_resolution, acoustic_converter=acoustic_converter, - vocoder=vocoder, ) voice_changer_stream = VoiceChangerStream( - voice_changer=voice_changer, sampling_rate=audio_config.rate, + frame_period=acoustic_converter._param.acoustic_feature_param.frame_period, in_dtype=numpy.float32, ) +voice_changer_stream.voice_changer = voice_changer +voice_changer_stream.vocoder = vocoder + wrapper = VoiceChangerStreamWrapper( voice_changer_stream=voice_changer_stream, + extra_time_pre=1, extra_time=0.2, ) @@ -85,9 +88,26 @@ class AudioConfig(NamedTuple): wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in) start_time += len(wave_in.wave) / wave_in.sampling_rate - wave_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate) +start_time = 0 +for i in range(len(raw_wave) // audio_config.chunk + 1): + feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate) + wrapper.voice_changer_stream.add_in_feature(start_time=start_time, feature=feature_in, frame_period=frame_period) + start_time += audio_config.chunk / audio_config.rate + print('pre', i, flush=True) + +start_time = 0 +for i in range(len(raw_wave) // audio_config.chunk + 1): + feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate) + wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period) + start_time += audio_config.chunk / audio_config.rate + print('cent', i, flush=True) + +start_time = 0 +for i in range(len(raw_wave) // audio_config.chunk + 1): + wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.rate) wave_out_list.append(wave_out) - wrapper.remove_previous_wave() + start_time += audio_config.chunk / audio_config.rate + print('post', i, flush=True) out_wave = 
numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32) librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate) From f279994afdba8e08fc5e042a25f50db548ddbae3 Mon Sep 17 00:00:00 2001 From: Hiroshiba Kazuyuki Date: Fri, 9 Mar 2018 02:52:24 +0900 Subject: [PATCH 2/2] =?UTF-8?q?=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- become_yukarin/data_struct.py | 64 ++++++++- become_yukarin/voice_changer.py | 243 ++++++++++---------------------- 2 files changed, 135 insertions(+), 172 deletions(-) diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py index 78c8cf3..4474331 100644 --- a/become_yukarin/data_struct.py +++ b/become_yukarin/data_struct.py @@ -1,8 +1,9 @@ -from typing import NamedTuple +from typing import NamedTuple, Dict, List +import numpy import pyworld -import numpy +_min_mc = -18.3 class Wave(NamedTuple): @@ -11,11 +12,21 @@ class Wave(NamedTuple): class AcousticFeature(NamedTuple): - f0: numpy.ndarray - spectrogram: numpy.ndarray - aperiodicity: numpy.ndarray - mfcc: numpy.ndarray - voiced: numpy.ndarray + f0: numpy.ndarray = numpy.nan + spectrogram: numpy.ndarray = numpy.nan + aperiodicity: numpy.ndarray = numpy.nan + mfcc: numpy.ndarray = numpy.nan + voiced: numpy.ndarray = numpy.nan + + @staticmethod + def dtypes(): + return dict( + f0=numpy.float32, + spectrogram=numpy.float32, + aperiodicity=numpy.float32, + mfcc=numpy.float32, + voiced=numpy.bool, + ) def astype(self, dtype): return AcousticFeature( @@ -50,6 +61,45 @@ def validate(self): assert self.voiced.dtype == numpy.bool + @staticmethod + def silent(length: int, sizes: Dict[str, int], keys: List[str]): + d = {} + if 'f0' in keys: + d['f0'] = numpy.zeros((length, sizes['f0']), dtype=AcousticFeature.dtypes()['f0']) + if 'spectrogram' in keys: + d['spectrogram'] = numpy.zeros((length, sizes['spectrogram']), + dtype=AcousticFeature.dtypes()['spectrogram']) + if 'aperiodicity' in keys: + d['aperiodicity'] = numpy.zeros((length, sizes['aperiodicity']), + dtype=AcousticFeature.dtypes()['aperiodicity']) + if 'mfcc' in keys: + d['mfcc'] = numpy.hstack(( + numpy.ones((length, 1), dtype=AcousticFeature.dtypes()['mfcc']) * _min_mc, + numpy.zeros((length, sizes['mfcc'] - 1), dtype=AcousticFeature.dtypes()['mfcc']) + )) + if 'voiced' in keys: + d['voiced'] = numpy.zeros((length, sizes['voiced']), dtype=AcousticFeature.dtypes()['voiced']) + feature = AcousticFeature(**d) + return feature + + @staticmethod + def concatenate(fs: List['AcousticFeature'], keys: List[str]): + is_target = lambda a: not numpy.any(numpy.isnan(a)) + return AcousticFeature(**{ + key: numpy.concatenate([getattr(f, key) for f in fs]) if is_target(getattr(fs[0], key)) else numpy.nan + for key in keys + }) + + def pick(self, first: int, last: int): + is_target = lambda a: not numpy.any(numpy.isnan(a)) + return AcousticFeature( + f0=self.f0[first:last] if is_target(self.f0) else numpy.nan, + spectrogram=self.spectrogram[first:last] if is_target(self.spectrogram) else numpy.nan, + aperiodicity=self.aperiodicity[first:last] if is_target(self.aperiodicity) else numpy.nan, + mfcc=self.mfcc[first:last] if is_target(self.mfcc) else numpy.nan, + voiced=self.voiced[first:last] if is_target(self.voiced) else numpy.nan, + ) + @staticmethod def get_sizes(sampling_rate: int, order: int): fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) diff --git 
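
The data_struct.py changes above introduce a convention where numpy.nan marks a field that is not populated: silent() produces padding frames (zeros, with the first mfcc coefficient floored at _min_mc), concatenate() joins only the populated fields, and pick() slices them. Below is a simplified stand-in with just two fields — not the real AcousticFeature — that reproduces the convention to show how the three helpers compose.

# Simplified stand-in (illustrative only) for the nan-as-missing convention above.
from typing import List, NamedTuple

import numpy


class MiniFeature(NamedTuple):
    f0: numpy.ndarray = numpy.nan
    spectrogram: numpy.ndarray = numpy.nan

    @staticmethod
    def silent(length: int):
        # zero-filled padding frames; spectrogram stays "absent"
        return MiniFeature(f0=numpy.zeros((length, 1), dtype=numpy.float32))

    @staticmethod
    def concatenate(fs: List['MiniFeature']):
        present = lambda a: not numpy.any(numpy.isnan(a))
        return MiniFeature(*[
            numpy.concatenate([getattr(f, key) for f in fs]) if present(getattr(fs[0], key)) else numpy.nan
            for key in MiniFeature._fields
        ])

    def pick(self, first: int, last: int):
        present = lambda a: not numpy.any(numpy.isnan(a))
        return MiniFeature(*[
            getattr(self, key)[first:last] if present(getattr(self, key)) else numpy.nan
            for key in MiniFeature._fields
        ])


chunk = MiniFeature(f0=numpy.ones((50, 1), dtype=numpy.float32))
padded = MiniFeature.concatenate([MiniFeature.silent(10), chunk, MiniFeature.silent(10)])
print(padded.pick(10, 60).f0.shape)  # (50, 1); spectrogram is still numpy.nan
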
a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py index 3b75fb1..bed155f 100644 --- a/become_yukarin/voice_changer.py +++ b/become_yukarin/voice_changer.py @@ -1,4 +1,5 @@ -from typing import List +from abc import ABCMeta, abstractproperty, abstractmethod +from typing import List, Callable, Any from typing import NamedTuple import numpy @@ -24,16 +25,6 @@ def __init__( self.super_resolution = super_resolution self.output_sampling_rate = output_sampling_rate - # def convert_from_wave_path(self, wave_path: str): - # w_in = self.acoustic_converter._wave_process(wave_path) - # return self.convert_from_wave(w_in) - # - # def convert_from_wave(self, wave: Wave): - # f_in = self.acoustic_converter._feature_process(wave) - # f_high = self.convert_from_acoustic_feature(f_in) - # wave = self.vocoder.decode(f_high) - # return wave - def convert_from_acoustic_feature(self, f_in: AcousticFeature): f_low = self.acoustic_converter.convert_to_feature(f_in) s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32)) @@ -41,7 +32,21 @@ def convert_from_acoustic_feature(self, f_in: AcousticFeature): return f_high -class FeatureSegment(NamedTuple): +class BaseSegment(ABCMeta): + start_time: float + + @property + @abstractmethod + def time_length(self) -> float: + pass + + @property + @abstractmethod + def end_time(self) -> float: + pass + + +class FeatureSegment(NamedTuple, BaseSegment): start_time: float feature: AcousticFeature frame_period: float @@ -55,7 +60,7 @@ def end_time(self): return self.time_length + self.start_time -class Segment(NamedTuple): +class WaveSegment(NamedTuple, BaseSegment): start_time: float wave: Wave @@ -81,7 +86,7 @@ def __init__( self.voice_changer: VoiceChanger = None self.vocoder: Vocoder = None - self._data_stream = [] # type: List[Segment] + self._data_stream = [] # type: List[WaveSegment] self._in_feature_stream = [] # type: List[FeatureSegment] self._out_feature_stream = [] # type: List[FeatureSegment] @@ -90,7 +95,7 @@ def add_wave(self, start_time: float, wave: Wave): assert wave.sampling_rate == self.sampling_rate assert wave.wave.dtype == self.in_dtype - segment = Segment(start_time=start_time, wave=wave) + segment = WaveSegment(start_time=start_time, wave=wave) self._data_stream.append(segment) def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float): @@ -113,23 +118,31 @@ def remove(self, end_time: float): self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream)) self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream)) - def pre_convert(self, start_time: float, time_length: float, extra_time: float): + @staticmethod + def fetch( + start_time: float, + time_length: float, + data_stream: List[BaseSegment], + rate: float, + pad_function: Callable[[int], Any], + pick_function: Callable[[Any, int, int], Any], + concat_function: Callable[[List], Any], + extra_time: float = 0, + ): start_time -= extra_time time_length += extra_time * 2 end_time = start_time + time_length buffer_list = [] - stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream) + stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), data_stream) start_time_buffer = start_time remaining_time = time_length for segment in stream: # padding if segment.start_time > start_time_buffer: - pad = numpy.zeros( - shape=int((segment.start_time - start_time_buffer) * self.sampling_rate), - 
dtype=self.in_dtype, - ) + length = int((segment.start_time - start_time_buffer) * rate) + pad = pad_function(length) buffer_list.append(pad) start_time_buffer = segment.start_time @@ -138,9 +151,9 @@ def pre_convert(self, start_time: float, time_length: float, extra_time: float): else: one_time_length = remaining_time - first_index = int((start_time_buffer - segment.start_time) * self.sampling_rate) - last_index = int(first_index + one_time_length * self.sampling_rate) - one_buffer = segment.wave.wave[first_index:last_index] + first_index = int((start_time_buffer - segment.start_time) * rate) + last_index = int(first_index + one_time_length * rate) + one_buffer = pick_function(segment, first_index, last_index) buffer_list.append(one_buffer) start_time_buffer += one_time_length @@ -150,163 +163,63 @@ def pre_convert(self, start_time: float, time_length: float, extra_time: float): break else: # last padding - pad = numpy.zeros(shape=int((end_time - start_time_buffer) * self.sampling_rate), dtype=self.in_dtype) + length = int((end_time - start_time_buffer) * rate) + pad = pad_function(length) buffer_list.append(pad) - buffer = numpy.concatenate(buffer_list) - in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate) + buffer = concat_function(buffer_list) + return buffer + + def pre_convert(self, start_time: float, time_length: float, extra_time: float): + wave = self.fetch( + start_time=start_time, + time_length=time_length, + extra_time=extra_time, + data_stream=self._data_stream, + rate=self.sampling_rate, + pad_function=lambda length: numpy.zeros(shape=length, dtype=self.in_dtype), + pick_function=lambda segment, first, last: segment.wave.wave[first:last], + concat_function=numpy.concatenate, + ) + in_wave = Wave(wave=wave, sampling_rate=self.sampling_rate) in_feature = self.vocoder.encode(in_wave) pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000)) - in_feature = AcousticFeature( - f0=in_feature.f0[pad:-pad], - spectrogram=in_feature.spectrogram[pad:-pad], - aperiodicity=in_feature.aperiodicity[pad:-pad], - mfcc=in_feature.mfcc[pad:-pad], - voiced=in_feature.voiced[pad:-pad], - ) + in_feature = in_feature.pick(pad, -pad) return in_feature def convert(self, start_time: float, time_length: float, extra_time: float): - start_time -= extra_time - time_length += extra_time * 2 - order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order - - end_time = start_time + time_length - f0_buffer_list = [] - mfcc_buffer_list = [] - ap_buffer_list = [] - voiced_buffer_list = [] - stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._in_feature_stream) - - start_time_buffer = start_time - remaining_time = time_length - for segment in stream: - # padding - if segment.start_time > start_time_buffer: - pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period) - dims = AcousticFeature.get_sizes(self.sampling_rate, order) - - f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) - mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype)) - ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) - voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool)) - - start_time_buffer = segment.start_time - if remaining_time > segment.end_time - start_time_buffer: - one_time_length = segment.end_time - start_time_buffer - else: - one_time_length = remaining_time - - 
first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period) - last_index = int(first_index + one_time_length * 1000 / self.frame_period) - - f0_buffer_list.append(segment.feature.f0[first_index:last_index]) - mfcc_buffer_list.append(segment.feature.mfcc[first_index:last_index]) - ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index]) - voiced_buffer_list.append(segment.feature.voiced[first_index:last_index]) - - start_time_buffer += one_time_length - remaining_time -= one_time_length - - if start_time_buffer >= end_time: - break - else: - # last padding - pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period) - dims = AcousticFeature.get_sizes(self.sampling_rate, order) - - f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) - mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype)) - ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) - voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool)) - - f0 = numpy.concatenate(f0_buffer_list) - mfcc = numpy.concatenate(mfcc_buffer_list) - aperiodicity = numpy.concatenate(ap_buffer_list) - voiced = numpy.concatenate(voiced_buffer_list) - in_feature = AcousticFeature( - f0=f0, - spectrogram=numpy.nan, - aperiodicity=aperiodicity, - mfcc=mfcc, - voiced=voiced, + sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=order) + keys = ['f0', 'aperiodicity', 'mfcc', 'voiced'] + in_feature = self.fetch( + start_time=start_time, + time_length=time_length, + extra_time=extra_time, + data_stream=self._in_feature_stream, + rate=1000 / self.frame_period, + pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys), + pick_function=lambda segment, first, last: segment.feature.pick(first, last), + concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys), ) - out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature) pad = int(extra_time * 1000 / self.frame_period) - out_feature= AcousticFeature( - f0=out_feature.f0[pad:-pad], - spectrogram=out_feature.spectrogram[pad:-pad], - aperiodicity=out_feature.aperiodicity[pad:-pad], - mfcc=out_feature.mfcc[pad:-pad], - voiced=out_feature.voiced[pad:-pad], - ) + out_feature = out_feature.pick(pad, -pad) return out_feature def post_convert(self, start_time: float, time_length: float): - end_time = start_time + time_length - f0_buffer_list = [] - sp_buffer_list = [] - ap_buffer_list = [] - voiced_buffer_list = [] - stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._out_feature_stream) - - start_time_buffer = start_time - remaining_time = time_length - for segment in stream: - # padding - if segment.start_time > start_time_buffer: - pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period) - dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order) - - f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype)) - sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype)) - ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype)) - voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool)) - - start_time_buffer = segment.start_time - - if remaining_time > segment.end_time - start_time_buffer: - one_time_length = segment.end_time - start_time_buffer 
-            else:
-                one_time_length = remaining_time
-
-            first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period)
-            last_index = int(first_index + one_time_length * 1000 / self.frame_period)
-
-            f0_buffer_list.append(segment.feature.f0[first_index:last_index])
-            sp_buffer_list.append(segment.feature.spectrogram[first_index:last_index])
-            ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index])
-            voiced_buffer_list.append(segment.feature.voiced[first_index:last_index])
-
-            start_time_buffer += one_time_length
-            remaining_time -= one_time_length
-
-            if start_time_buffer >= end_time:
-                break
-        else:
-            # last padding
-            pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period)
-            dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order)
-
-            f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
-            sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype))
-            ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
-            voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
-
-        f0 = numpy.concatenate(f0_buffer_list)
-        spectrogram = numpy.concatenate(sp_buffer_list)
-        aperiodicity = numpy.concatenate(ap_buffer_list)
-        voiced = numpy.concatenate(voiced_buffer_list)
-        out_feature = AcousticFeature(
-            f0=f0,
-            spectrogram=spectrogram,
-            aperiodicity=aperiodicity,
-            mfcc=numpy.nan,
-            voiced=voiced,
+        order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
+        sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=order)
+        keys = ['f0', 'aperiodicity', 'spectrogram', 'voiced']
+        out_feature = self.fetch(
+            start_time=start_time,
+            time_length=time_length,
+            data_stream=self._out_feature_stream,
+            rate=1000 / self.frame_period,
+            pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
+            pick_function=lambda segment, first, last: segment.feature.pick(first, last),
+            concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
         )
         out_wave = self.vocoder.decode(
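
The refactoring above replaces the three near-identical segment-assembly loops of pre_convert, convert and post_convert with a single fetch() parameterized by pad_function, pick_function and concat_function. The sketch below is a self-contained illustration of that windowing idea on plain 1-D arrays; all names are illustrative, and the real fetch() additionally widens the window by extra_time and operates on Wave or AcousticFeature segments through the callbacks.

# Minimal sketch: cut [start_time, start_time + time_length) out of timestamped segments,
# zero-padding any part of the window that no segment covers.
from typing import List, NamedTuple

import numpy


class Segment(NamedTuple):
    start_time: float
    data: numpy.ndarray  # samples at `rate` items per second

    def end_time(self, rate: float) -> float:
        return self.start_time + len(self.data) / rate


def fetch(start_time: float, time_length: float, segments: List[Segment], rate: float) -> numpy.ndarray:
    end_time = start_time + time_length
    buffers = []
    cursor = start_time
    for segment in sorted(segments, key=lambda s: s.start_time):
        if segment.end_time(rate) <= cursor or segment.start_time >= end_time:
            continue  # segment lies outside the requested window
        if segment.start_time > cursor:
            # gap before this segment: pad with silence
            buffers.append(numpy.zeros(int((segment.start_time - cursor) * rate), dtype=segment.data.dtype))
            cursor = segment.start_time
        first = int((cursor - segment.start_time) * rate)
        last = int((min(end_time, segment.end_time(rate)) - segment.start_time) * rate)
        buffers.append(segment.data[first:last])
        cursor = segment.start_time + last / rate
        if cursor >= end_time:
            break
    else:
        # window extends past the last segment: pad the tail
        buffers.append(numpy.zeros(int((end_time - cursor) * rate), dtype=numpy.float32))
    return numpy.concatenate(buffers)


rate = 100.0
segments = [Segment(0.0, numpy.ones(100, dtype=numpy.float32)),
            Segment(1.5, numpy.ones(50, dtype=numpy.float32))]
window = fetch(start_time=0.5, time_length=2.0, segments=segments, rate=rate)
print(window.shape)  # (200,): 0.5 s of data, 0.5 s gap, 0.5 s of data, 0.5 s tail padding

Passing the padding, slicing and concatenation steps in as callables is what lets the same loop serve both raw samples (rate = sampling_rate in pre_convert) and feature frames (rate = 1000 / frame_period in convert and post_convert).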