-
Notifications
You must be signed in to change notification settings - Fork 49
/
audio_utilities.py
317 lines (261 loc) · 11.8 KB
/
audio_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
import math
import sys
import time
import numpy as np
import wave
import scipy
import scipy.signal
from pylab import *
import array
import os
from os.path import expanduser
import scipy.io.wavfile
# Author: Brian K. Vogel
def hz_to_mel(f_hz):
"""Convert Hz to mel scale.
This uses the formula from O'Shaugnessy's book.
Args:
f_hz (float): The value in Hz.
Returns:
The value in mels.
"""
return 2595*np.log10(1.0 + f_hz/700.0)
def mel_to_hz(m_mel):
"""Convert mel scale to Hz.
This uses the formula from O'Shaugnessy's book.
Args:
m_mel (float): The value in mels
Returns:
The value in Hz
"""
return 700*(10**(m_mel/2595) - 1.0)
def fft_bin_to_hz(n_bin, sample_rate_hz, fft_size):
"""Convert FFT bin index to frequency in Hz.
Args:
n_bin (int or float): The FFT bin index.
sample_rate_hz (int or float): The sample rate in Hz.
fft_size (int or float): The FFT size.
Returns:
The value in Hz.
"""
n_bin = float(n_bin)
sample_rate_hz = float(sample_rate_hz)
fft_size = float(fft_size)
return n_bin*sample_rate_hz/(2.0*fft_size)
def hz_to_fft_bin(f_hz, sample_rate_hz, fft_size):
"""Convert frequency in Hz to FFT bin index.
Args:
f_hz (int or float): The frequency in Hz.
sample_rate_hz (int or float): The sample rate in Hz.
fft_size (int or float): The FFT size.
Returns:
The FFT bin index as an int.
"""
f_hz = float(f_hz)
sample_rate_hz = float(sample_rate_hz)
fft_size = float(fft_size)
fft_bin = int(np.round((f_hz*2.0*fft_size/sample_rate_hz)))
if fft_bin >= fft_size:
fft_bin = fft_size-1
return fft_bin
def make_mel_filterbank(min_freq_hz, max_freq_hz, mel_bin_count,
linear_bin_count, sample_rate_hz):
"""Create a mel filterbank matrix.
Create and return a mel filterbank matrix `filterbank` of shape (`mel_bin_count`,
`linear_bin_couont`). The `filterbank` matrix can be used to transform a
(linear scale) spectrum or spectrogram into a mel scale spectrum or
spectrogram as follows:
`mel_scale_spectrum` = `filterbank`*'linear_scale_spectrum'
where linear_scale_spectrum' is a shape (`linear_bin_count`, `m`) and
`mel_scale_spectrum` is shape ('mel_bin_count', `m`) where `m` is the number
of spectral time slices.
Likewise, the reverse-direction transform can be performed as:
'linear_scale_spectrum' = filterbank.T`*`mel_scale_spectrum`
Note that the process of converting to mel scale and then back to linear
scale is lossy.
This function computes the mel-spaced filters such that each filter is triangular
(in linear frequency) with response 1 at the center frequency and decreases linearly
to 0 upon reaching an adjacent filter's center frequency. Note that any two adjacent
filters will overlap having a response of 0.5 at the mean frequency of their
respective center frequencies.
Args:
min_freq_hz (float): The frequency in Hz corresponding to the lowest
mel scale bin.
max_freq_hz (flloat): The frequency in Hz corresponding to the highest
mel scale bin.
mel_bin_count (int): The number of mel scale bins.
linear_bin_count (int): The number of linear scale (fft) bins.
sample_rate_hz (float): The sample rate in Hz.
Returns:
The mel filterbank matrix as an 2-dim Numpy array.
"""
min_mels = hz_to_mel(min_freq_hz)
max_mels = hz_to_mel(max_freq_hz)
# Create mel_bin_count linearly spaced values between these extreme mel values.
mel_lin_spaced = np.linspace(min_mels, max_mels, num=mel_bin_count)
# Map each of these mel values back into linear frequency (Hz).
center_frequencies_hz = np.array([mel_to_hz(n) for n in mel_lin_spaced])
mels_per_bin = float(max_mels - min_mels)/float(mel_bin_count - 1)
mels_start = min_mels - mels_per_bin
hz_start = mel_to_hz(mels_start)
fft_bin_start = hz_to_fft_bin(hz_start, sample_rate_hz, linear_bin_count)
#print('fft_bin_start: ', fft_bin_start)
mels_end = max_mels + mels_per_bin
hz_stop = mel_to_hz(mels_end)
fft_bin_stop = hz_to_fft_bin(hz_stop, sample_rate_hz, linear_bin_count)
#print('fft_bin_stop: ', fft_bin_stop)
# Map each center frequency to the closest fft bin index.
linear_bin_indices = np.array([hz_to_fft_bin(f_hz, sample_rate_hz, linear_bin_count) for f_hz in center_frequencies_hz])
# Create filterbank matrix.
filterbank = np.zeros((mel_bin_count, linear_bin_count))
for mel_bin in range(mel_bin_count):
center_freq_linear_bin = linear_bin_indices[mel_bin]
# Create a triangular filter having the current center freq.
# The filter will start with 0 response at left_bin (if it exists)
# and ramp up to 1.0 at center_freq_linear_bin, and then ramp
# back down to 0 response at right_bin (if it exists).
# Create the left side of the triangular filter that ramps up
# from 0 to a response of 1 at the center frequency.
if center_freq_linear_bin > 1:
# It is possible to create the left triangular filter.
if mel_bin == 0:
# Since this is the first center frequency, the left side
# must start ramping up from linear bin 0 or 1 mel bin before the center freq.
left_bin = max(0, fft_bin_start)
else:
# Start ramping up from the previous center frequency bin.
left_bin = linear_bin_indices[mel_bin - 1]
for f_bin in range(left_bin, center_freq_linear_bin+1):
if (center_freq_linear_bin - left_bin) > 0:
response = float(f_bin - left_bin)/float(center_freq_linear_bin - left_bin)
filterbank[mel_bin, f_bin] = response
# Create the right side of the triangular filter that ramps down
# from 1 to 0.
if center_freq_linear_bin < linear_bin_count-2:
# It is possible to create the right triangular filter.
if mel_bin == mel_bin_count - 1:
# Since this is the last mel bin, we must ramp down to response of 0
# at the last linear freq bin.
right_bin = min(linear_bin_count - 1, fft_bin_stop)
else:
right_bin = linear_bin_indices[mel_bin + 1]
for f_bin in range(center_freq_linear_bin, right_bin+1):
if (right_bin - center_freq_linear_bin) > 0:
response = float(right_bin - f_bin)/float(right_bin - center_freq_linear_bin)
filterbank[mel_bin, f_bin] = response
filterbank[mel_bin, center_freq_linear_bin] = 1.0
return filterbank
def stft_for_reconstruction(x, fft_size, hopsamp):
"""Compute and return the STFT of the supplied time domain signal x.
Args:
x (1-dim Numpy array): A time domain signal.
fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
hopsamp (int):
Returns:
The STFT. The rows are the time slices and columns are the frequency bins.
"""
window = np.hanning(fft_size)
fft_size = int(fft_size)
hopsamp = int(hopsamp)
return np.array([np.fft.rfft(window*x[i:i+fft_size])
for i in range(0, len(x)-fft_size, hopsamp)])
def istft_for_reconstruction(X, fft_size, hopsamp):
"""Invert a STFT into a time domain signal.
Args:
X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
fft_size (int):
hopsamp (int): The hop size, in samples.
Returns:
The inverse STFT.
"""
fft_size = int(fft_size)
hopsamp = int(hopsamp)
window = np.hanning(fft_size)
time_slices = X.shape[0]
len_samples = int(time_slices*hopsamp + fft_size)
x = np.zeros(len_samples)
for n,i in enumerate(range(0, len(x)-fft_size, hopsamp)):
x[i:i+fft_size] += window*np.real(np.fft.irfft(X[n]))
return x
def get_signal(in_file, expected_fs=44100):
"""Load a wav file.
If the file contains more than one channel, return a mono file by taking
the mean of all channels.
If the sample rate differs from the expected sample rate (default is 44100 Hz),
raise an exception.
Args:
in_file: The input wav file, which should have a sample rate of `expected_fs`.
expected_fs (int): The expected sample rate of the input wav file.
Returns:
The audio siganl as a 1-dim Numpy array. The values will be in the range [-1.0, 1.0]. fixme ( not yet)
"""
fs, y = scipy.io.wavfile.read(in_file)
num_type = y[0].dtype
if num_type == 'int16':
y = y*(1.0/32768)
elif num_type == 'int32':
y = y*(1.0/2147483648)
elif num_type == 'float32':
# Nothing to do
pass
elif num_type == 'uint8':
raise Exception('8-bit PCM is not supported.')
else:
raise Exception('Unknown format.')
if fs != expected_fs:
raise Exception('Invalid sample rate.')
if y.ndim == 1:
return y
else:
return y.mean(axis=1)
def reconstruct_signal_griffin_lim(magnitude_spectrogram, fft_size, hopsamp, iterations):
"""Reconstruct an audio signal from a magnitude spectrogram.
Given a magnitude spectrogram as input, reconstruct
the audio signal and return it using the Griffin-Lim algorithm from the paper:
"Signal estimation from modified short-time fourier transform" by Griffin and Lim,
in IEEE transactions on Acoustics, Speech, and Signal Processing. Vol ASSP-32, No. 2, April 1984.
Args:
magnitude_spectrogram (2-dim Numpy array): The magnitude spectrogram. The rows correspond to the time slices
and the columns correspond to frequency bins.
fft_size (int): The FFT size, which should be a power of 2.
hopsamp (int): The hope size in samples.
iterations (int): Number of iterations for the Griffin-Lim algorithm. Typically a few hundred
is sufficient.
Returns:
The reconstructed time domain signal as a 1-dim Numpy array.
"""
time_slices = magnitude_spectrogram.shape[0]
len_samples = int(time_slices*hopsamp + fft_size)
# Initialize the reconstructed signal to noise.
x_reconstruct = np.random.randn(len_samples)
n = iterations # number of iterations of Griffin-Lim algorithm.
while n > 0:
n -= 1
reconstruction_spectrogram = stft_for_reconstruction(x_reconstruct, fft_size, hopsamp)
reconstruction_angle = np.angle(reconstruction_spectrogram)
# Discard magnitude part of the reconstruction and use the supplied magnitude spectrogram instead.
proposal_spectrogram = magnitude_spectrogram*np.exp(1.0j*reconstruction_angle)
prev_x = x_reconstruct
x_reconstruct = istft_for_reconstruction(proposal_spectrogram, fft_size, hopsamp)
diff = sqrt(sum((x_reconstruct - prev_x)**2)/x_reconstruct.size)
print('Reconstruction iteration: {}/{} RMSE: {} '.format(iterations - n, iterations, diff))
return x_reconstruct
def save_audio_to_file(x, sample_rate, outfile='out.wav'):
"""Save a mono signal to a file.
Args:
x (1-dim Numpy array): The audio signal to save. The signal values should be in the range [-1.0, 1.0].
sample_rate (int): The sample rate of the signal, in Hz.
outfile: Name of the file to save.
"""
x_max = np.max(abs(x))
assert x_max <= 1.0, 'Input audio value is out of range. Should be in the range [-1.0, 1.0].'
x = x*32767.0
data = array.array('h')
for i in range(len(x)):
cur_samp = int(round(x[i]))
data.append(cur_samp)
f = wave.open(outfile, 'w')
f.setparams((1, 2, sample_rate, 0, "NONE", "Uncompressed"))
f.writeframes(data.tostring())
f.close()