import numpy as np
from numpy.lib.stride_tricks import as_strided
from sklearn.utils import shuffle
import python_speech_features as p
import scipy.io.wavfile as wav
import soundfile
from aubio import source, pvoc, mfcc
from numpy import vstack, zeros
from keras.preprocessing.sequence import pad_sequences

from utils import text_to_int_sequence


# Data batch generator, responsible for providing the data to fit_generator
class BatchGenerator(object):
    def __init__(self, dataframe, dataproperties, training, batch_size=16, model_input_type="mfcc"):
        self.training_data = training
        # one of: "mfcc", "mfcc-aubio", "spectrogram", "spectrogram-img"
        # (only the first three are handled by get_batch)
        self.model_input_type = model_input_type
        self.df = dataframe.copy()
        # expected dataframe columns: ['wav_filesize', 'transcript', 'wav_filename']
        self.wavpath = self.df['wav_filename'].tolist()
        self.transcript = self.df['transcript'].tolist()
        self.finish = self.df['wav_filesize'].tolist()
        self.start = np.zeros(len(self.finish))
        self.length = self.finish
        self.shuffling = True
        self.batch_size = batch_size
        self.cur_index = 0
        self.feats_std = 0
        self.feats_mean = 0
        self.set_of_all_int_outputs_used = None
        # free up memory: the dataframe copies are no longer needed once the
        # columns have been extracted into lists
        del dataframe
        del dataproperties
        self.df = None
        del self.df

    def normalise(self, feature, eps=1e-14):
        return (feature - self.feats_mean) / (self.feats_std + eps)

    def get_batch(self, idx):
        batch_x = self.wavpath[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y_trans = self.transcript[idx * self.batch_size:(idx + 1) * self.batch_size]

        try:
            assert (len(batch_x) == self.batch_size)
            assert (len(batch_y_trans) == self.batch_size)
        except Exception as e:
            print(e)
            print(batch_x)
            print(batch_y_trans)

        # 1. X_data (the features for the batch), padded to the longest
        #    utterance in the batch
        if self.model_input_type == "spectrogram":
            x_val = [get_max_specto_time(file_name) for file_name in batch_x]
            max_val = max(x_val)
            X_data = np.array([make_specto_shape(file_name, padlen=max_val) for file_name in batch_x])
            assert (X_data.shape == (self.batch_size, max_val, 161))
        elif self.model_input_type == "mfcc-aubio":
            x_val = [get_max_aubio(file_name) for file_name in batch_x]
            max_val = max(x_val)
            X_data = np.array([make_aubio_shape(file_name, padlen=max_val) for file_name in batch_x])
            assert (X_data.shape == (self.batch_size, max_val, 26))
        elif self.model_input_type == "mfcc":
            x_val = [get_max_time(file_name) for file_name in batch_x]
            max_val = max(x_val)
            X_data = np.array([make_mfcc_shape(file_name, padlen=max_val) for file_name in batch_x])
            assert (X_data.shape == (self.batch_size, max_val, 26))
        else:
            raise ValueError("Unknown model_input_type: {}".format(self.model_input_type))

        # 2. labels (transcripts made numerical), padded to the longest
        #    transcript in the batch
        y_val = [get_maxseq_len(l) for l in batch_y_trans]
        max_y = max(y_val)
        labels = np.array([get_intseq(l, max_intseq_length=max_y) for l in batch_y_trans])
        assert (labels.shape == (self.batch_size, max_y))

        # 3. input_length (required for CTC loss)
        # this is the time dimension of CTC (batch x time x features)
        input_length = np.array(x_val)
        assert (input_length.shape == (self.batch_size,))

        # 4. label_length (required for CTC loss)
        # this is the unpadded length of each label sequence
        label_length = np.array(y_val)
        assert (label_length.shape == (self.batch_size,))

        # 5. source_str (used for human-readable output in callbacks)
        source_str = np.array([l for l in batch_y_trans])

        inputs = {
            'the_input': X_data,
            'the_labels': labels,
            'input_length': input_length,
            'label_length': label_length,
            'source_str': source_str
        }
        # dummy targets: the real CTC loss is computed inside the model
        outputs = {'ctc': np.zeros([self.batch_size])}

        return (inputs, outputs)
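
    # Why the 'ctc' targets above are all zeros: in the usual Keras CTC setup,
    # the loss is computed inside the model by a Lambda layer wrapping
    # K.ctc_batch_cost, so fit_generator only needs a dummy y_true of the right
    # batch size. A minimal sketch of that convention (the layer wiring below
    # is an illustrative assumption, not code from this file):
    #
    #   from keras import backend as K
    #   from keras.layers import Lambda
    #   loss_out = Lambda(lambda args: K.ctc_batch_cost(*args), name='ctc')(
    #       [the_labels, y_pred, input_length, label_length])
    #   model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')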

    def next_batch(self):
        while 1:
            assert (self.batch_size <= len(self.wavpath))
            # reshuffle and restart once there is no longer a full batch left
            if (self.cur_index + 1) * self.batch_size >= len(self.wavpath) - self.batch_size:
                self.cur_index = 0
                if self.shuffling:
                    print("SHUFFLING as reached end of data")
                    self.genshuffle()
            try:
                ret = self.get_batch(self.cur_index)
            except Exception:
                print("data error - this shouldn't happen - try next batch")
                self.cur_index += 1
                ret = self.get_batch(self.cur_index)
            self.cur_index += 1
            yield ret
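
    # next_batch is an infinite generator, as Keras's fit_generator expects.
    # A typical pairing (the epochs value is an illustrative assumption):
    #
    #   model.fit_generator(train_gen.next_batch(),
    #                       steps_per_epoch=len(train_gen.wavpath) // train_gen.batch_size,
    #                       epochs=10)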

    def genshuffle(self):
        self.wavpath, self.transcript, self.finish = shuffle(self.wavpath,
                                                             self.transcript,
                                                             self.finish)

    def export_test_mfcc(self):
        # this is used to export data e.g. into iOS
        testset = next(self.next_batch())[0]
        mfcc = testset['the_input'][0:self.batch_size]  # export all features in the batch
        words = testset['source_str'][0:self.batch_size]
        labels = testset['the_labels'][0:self.batch_size]

        print("exporting:", type(mfcc))
        print(mfcc.shape)
        print(words.shape)
        print(labels.shape)

        # we save each mfcc/words/label as its own csv file
        for i in range(0, mfcc.shape[0]):
            np.savetxt('./Archive/test_spectro/test_spectro_{}.csv'.format(i), mfcc[i, :, :], delimiter=',')

        print(words)
        print(labels)

        return

    def get_normalise(self, k_samples=100):
        # todo: use normalise from DS2 - https://github.com/baidu-research/ba-dls-deepspeech
        """ Estimate the mean and std of the features from the training set
        Params:
            k_samples (int): Use this number of samples for estimation
        """
        # k_samples = min(k_samples, len(self.train_audio_paths))
        # samples = self.rng.sample(self.train_audio_paths, k_samples)
        # feats = [self.featurize(s) for s in samples]
        # feats = np.vstack(feats)
        # self.feats_mean = np.mean(feats, axis=0)
        # self.feats_std = np.std(feats, axis=0)
        pass
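
    # A minimal sketch of what get_normalise could compute once implemented,
    # using this class's own wavpath list and the make_mfcc_shape helper below
    # (an illustrative assumption, not the DS2 code the todo points to):
    #
    #   feats = np.vstack([make_mfcc_shape(f) for f in self.wavpath[:k_samples]])
    #   self.feats_mean = np.mean(feats, axis=0)
    #   self.feats_std = np.std(feats, axis=0)
    #
    # normalise() above would then standardise each feature dimension.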


def get_maxseq_len(trans):
    # length of the transcript once converted to an integer sequence
    t = text_to_int_sequence(trans)
    return len(t)


def get_intseq(trans, max_intseq_length=80):
    # convert the transcript to an integer sequence, padded with the
    # space character up to max_intseq_length
    t = text_to_int_sequence(trans)
    while len(t) < max_intseq_length:
        t.append(27)  # pad with the integer code for the space char
    return t
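
# Example of get_intseq padding, assuming utils.text_to_int_sequence maps
# a-z to 1-26 and space to 27 (consistent with padding using 27 above):
#
#   get_intseq("hi", max_intseq_length=5)  ->  [8, 9, 27, 27, 27]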


def get_max_time(filename):
    fs, audio = wav.read(filename)
    r = p.mfcc(audio, samplerate=fs, numcep=26)  # 2D array -> timesamples x mfcc_features
    return r.shape[0]


def get_max_specto_time(filename):
    r = spectrogram_from_file(filename)
    return r.shape[0]


def get_max_aubio(filename):
    r = aubio(filename)  # 2D array -> timesamples x mfcc_features
    return r.shape[0]


def make_specto_shape(filename, padlen=778):
    r = spectrogram_from_file(filename)
    t = np.transpose(r)  # 2D array -> spectro_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x spectro_features {padlen x 161}


def make_aubio_shape(filename, padlen=778):
    r = aubio(filename)
    t = np.transpose(r)  # 2D array -> mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {padlen x 26}


def make_mfcc_shape(filename, padlen=778):
    fs, audio = wav.read(filename)
    r = p.mfcc(audio, samplerate=fs, numcep=26)  # 2D array -> timesamples x mfcc_features
    t = np.transpose(r)  # 2D array -> mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {padlen x 26}
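
# The transpose/pad/transpose dance in the make_*_shape helpers exists because
# pad_sequences pads along the sequence axis, i.e. it treats each row as one
# sequence. A standalone sketch of the same shape transformation on dummy data:
#
#   r = np.random.rand(300, 26)              # timesamples x features
#   t = np.transpose(r)                      # features x timesamples
#   X = pad_sequences(t, maxlen=778, dtype='float',
#                     padding='post', truncating='post').T
#   assert X.shape == (778, 26)              # back to (padded) time x features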


def get_xsize(val):
    return val.shape[0]


def shuffle_data(self):
    # module-level twin of BatchGenerator.genshuffle; expects a
    # BatchGenerator-like object as `self`
    self.wavpath, self.transcript, self.finish = shuffle(self.wavpath,
                                                         self.transcript,
                                                         self.finish)
    return


def aubio(source_filename):
    # Streaming MFCC extraction using aubio's phase vocoder.
    # win_s/hop_s set the analysis window; in the original aubio demo a [mode]
    # of 'delta' or 'ddelta' selects first or second derivatives.
    n_filters = 40  # must be 40 for mfcc
    n_coeffs = 26
    win_s = 512
    hop_s = win_s // 4
    samplerate = 0  # 0 lets aubio pick the file's native samplerate

    s = source(source_filename, samplerate, hop_s)
    samplerate = s.samplerate
    p = pvoc(win_s, hop_s)
    m = mfcc(win_s, n_filters, n_coeffs, samplerate)

    # note: the first row of the result is the all-zero seed row from zeros(),
    # so the output has one more row than the number of frames read
    mfccs = zeros([n_coeffs, ])
    frames_read = 0
    while True:
        samples, read = s()
        spec = p(samples)
        mfcc_out = m(spec)
        mfccs = vstack((mfccs, mfcc_out))
        frames_read += read
        if read < hop_s:
            break
    return mfccs


## Required for DS2 - source: https://github.com/baidu-research/ba-dls-deepspeech
##############################################################################

def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
    """
    Compute the spectrogram for a real signal.
    The parameters follow the naming convention of
    matplotlib.mlab.specgram
    Args:
        samples (1D array): input audio signal
        fft_length (int): number of elements in fft window
        sample_rate (scalar): sample rate
        hop_length (int): hop length (relative offset between neighboring
            fft windows).
    Returns:
        x (2D array): spectrogram [frequency x time]
        freq (1D array): frequency of each row in x
    Note:
        This is a truncating computation e.g. if fft_length=10,
        hop_length=5 and the signal has 23 elements, then the
        last 3 elements will be truncated.
    """
    assert not np.iscomplexobj(samples), "Must not pass in complex numbers"

    window = np.hanning(fft_length)[:, None]
    window_norm = np.sum(window**2)

    # The scaling below follows the convention of
    # matplotlib.mlab.specgram which is the same as
    # matlab's specgram.
    scale = window_norm * sample_rate

    trunc = (len(samples) - fft_length) % hop_length
    x = samples[:len(samples) - trunc]

    # "stride trick" reshape to include overlap
    nshape = (fft_length, (len(x) - fft_length) // hop_length + 1)
    nstrides = (x.strides[0], x.strides[0] * hop_length)
    x = as_strided(x, shape=nshape, strides=nstrides)

    # window stride sanity check
    assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)])

    # broadcast window, compute fft over columns and square mod
    x = np.fft.rfft(x * window, axis=0)
    x = np.absolute(x)**2

    # scale, 2.0 for everything except dc and fft_length/2
    x[1:-1, :] *= (2.0 / scale)
    x[(0, -1), :] /= scale

    freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])

    return x, freqs
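
# Worked shape example for spectrogram (values chosen for illustration): for a
# 1-second 16 kHz signal with fft_length=320 and hop_length=160, the stride
# trick yields (16000 - 320) // 160 + 1 = 99 frames, and rfft gives
# 320 // 2 + 1 = 161 frequency rows - the 161 that get_batch asserts above:
#
#   x, freqs = spectrogram(np.random.rand(16000).astype('float32'),
#                          fft_length=320, sample_rate=16000, hop_length=160)
#   assert x.shape == (161, 99)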


def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
                          eps=1e-14):
    """ Calculate the log of linear spectrogram from FFT energy
    Params:
        filename (str): Path to the audio file
        step (int): Step size in milliseconds between windows
        window (int): FFT window size in milliseconds
        max_freq (int): Only FFT bins corresponding to frequencies between
            [0, max_freq] are returned
        eps (float): Small value to ensure numerical stability (for ln(x))
    """
    with soundfile.SoundFile(filename) as sound_file:
        audio = sound_file.read(dtype='float32')
        sample_rate = sound_file.samplerate
        if audio.ndim >= 2:
            audio = np.mean(audio, 1)
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate")
        if step > window:
            raise ValueError("step size must not be greater than window size")
        hop_length = int(0.001 * step * sample_rate)
        fft_length = int(0.001 * window * sample_rate)
        pxx, freqs = spectrogram(
            audio, fft_length=fft_length, sample_rate=sample_rate,
            hop_length=hop_length)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
    return np.transpose(np.log(pxx[:ind, :] + eps))
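
# Parameter arithmetic for spectrogram_from_file: at a 16 kHz sample rate the
# defaults step=10 and window=20 (milliseconds) give
# hop_length = int(0.001 * 10 * 16000) = 160 samples and
# fft_length = int(0.001 * 20 * 16000) = 320 samples, producing the 161
# frequency bins that make_specto_shape expects.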


def featurise(audio_clip):
    """ For a given audio clip, calculate the log of its Fourier Transform
    Params:
        audio_clip (str): Path to the audio clip
    """
    step = 10
    window = 20
    max_freq = 8000
    return spectrogram_from_file(
        audio_clip, step=step, window=window,
        max_freq=max_freq)
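

# Minimal usage sketch (assumptions: a pandas CSV with the columns expected by
# BatchGenerator.__init__, and a CTC-wired model as sketched after get_batch;
# the CSV path below is a placeholder):
if __name__ == "__main__":
    import pandas as pd

    df = pd.read_csv("data/train.csv")  # placeholder path
    gen = BatchGenerator(df, None, training=True, batch_size=16,
                         model_input_type="mfcc")
    inputs, outputs = next(gen.next_batch())
    print(inputs['the_input'].shape)   # (16, max_batch_time, 26)
    print(inputs['the_labels'].shape)  # (16, max_batch_label_len)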