def wav_data_to_samples_pydub(wav_data: bytes,
                              sample_rate: int,
                              remove_dc_bias: bool = False,
                              num_channels: int = None,
                              normalize_db: float = None) -> np.ndarray:
  """Convert audio file data (in bytes) into a numpy array using Pydub.

  Args:
    wav_data: A byte stream of audio data.
    sample_rate: Resample recorded audio to this sample rate.
    remove_dc_bias: If true, will remove DC bias from audio.
    num_channels: If not specified, output shape will be based on the contents
      of wav_data. Otherwise, will force to be 1 or 2 channels.
    normalize_db: If not None, the audio is normalized with this value passed
      as the `headroom` argument of pydub's `normalize` effect. Set to None to
      skip the normalization step.

  Returns:
    A float32 array of the recorded audio at sample_rate, scaled by the
    maximum value of the decoded integer sample type. If mono, will be shape
    [samples], otherwise [channels, samples].
  """
  # Parse and normalize the audio. Each pydub operation returns a new
  # AudioSegment, so every result must be reassigned to take effect.
  aseg = pydub.AudioSegment.from_file(io.BytesIO(wav_data))
  if num_channels:
    aseg = aseg.set_channels(num_channels)
  if remove_dc_bias:
    aseg = aseg.remove_dc_offset()
  if normalize_db is not None:
    # The original discarded this return value, making normalization a no-op.
    aseg = aseg.normalize(headroom=normalize_db)
  aseg = aseg.set_frame_rate(sample_rate)

  # Convert to numpy array: one row of integer samples per channel.
  channel_asegs = aseg.split_to_mono()
  samples = [s.get_array_of_samples() for s in channel_asegs]
  fp_arr = np.array(samples).astype(np.float32)
  # Scale integer PCM by the max of its storage type (e.g. 32767 for int16).
  fp_arr /= np.iinfo(samples[0].typecode).max

  # If only 1 channel, remove extra dim.
  if fp_arr.shape[0] == 1:
    fp_arr = fp_arr[0]

  return fp_arr
def testWavDataToSamplesPydub(self):
  """Checks pydub decoding for resampled length and basic signal sanity."""
  # Read the source frame counts and rates, closing each file right away so
  # the test does not leak open file handles.
  with wave.open(self.wav_filename, 'rb') as w:
    expected_len = round(16000.0 * w.getnframes() / w.getframerate())
  with wave.open(self.wav_filename_mono, 'rb') as w_mono:
    expected_len_mono = round(
        22050.0 * w_mono.getnframes() / w_mono.getframerate())

  # Check content size.
  y = audio_io.wav_data_to_samples_pydub(
      self.wav_data, sample_rate=16000, num_channels=1)
  y_mono = audio_io.wav_data_to_samples_pydub(
      self.wav_data_mono, sample_rate=22050, num_channels=1)
  self.assertEqual(expected_len, y.shape[0])
  self.assertEqual(expected_len_mono, y_mono.shape[0])

  # Check a few obvious failure modes: the decoded signal must not be
  # (near-)constant and must swing both below -0.1 and above 0.1.
  self.assertLess(0.01, y.std())
  self.assertLess(0.01, y_mono.std())
  self.assertGreater(-0.1, y.min())
  self.assertGreater(-0.1, y_mono.min())
  self.assertLess(0.1, y.max())
  self.assertLess(0.1, y_mono.max())