8
8
import logging
9
9
import os
10
10
import sys
11
+ from typing import List
11
12
from concurrent .futures import ProcessPoolExecutor
12
13
from pathlib import Path
13
14
@@ -31,7 +32,15 @@ def get_parser():
31
32
parser .add_argument ("--data-dir" , default = "data" , type = str , help = "data directory" )
32
33
parser .add_argument ("--seed" , default = 1 , type = int , help = "random seed" )
33
34
parser .add_argument (
34
- "--nj" , default = 1 , type = int , help = "number of jobs for features extraction"
35
+ "--num-jobs" , default = 1 , type = int , help = "number of jobs for features extraction"
36
+ )
37
+ parser .add_argument (
38
+ "--max-remaining-duration" , default = 0.3 , type = float ,
39
+ help = "not split if the left-over duration is less than this many seconds"
40
+ )
41
+ parser .add_argument (
42
+ "--overlap-duration" , default = 0.3 , type = float ,
43
+ help = "overlap between adjacent segments while splitting negative recordings"
35
44
)
36
45
# fmt: on
37
46
@@ -41,7 +50,9 @@ def get_parser():
41
50
def main (args ):
42
51
try :
43
52
# TODO use pip install once it's available
44
- from espresso .tools .lhotse import CutSet , Mfcc , MfccConfig , LilcomFilesWriter , WavAugmenter
53
+ from espresso .tools .lhotse import (
54
+ CutSet , Mfcc , MfccConfig , LilcomFilesWriter , SupervisionSet , WavAugmenter
55
+ )
45
56
from espresso .tools .lhotse .manipulation import combine
46
57
from espresso .tools .lhotse .recipes .mobvoihotwords import download_and_untar , prepare_mobvoihotwords
47
58
except ImportError :
@@ -68,36 +79,46 @@ def main(args):
68
79
np .random .seed (args .seed )
69
80
# equivalent to Kaldi's mfcc_hires config
70
81
mfcc = Mfcc (config = MfccConfig (num_mel_bins = 40 , num_ceps = 40 , low_freq = 20 , high_freq = - 400 ))
71
- num_jobs = args .nj
72
82
for partition , manifests in mobvoihotwords_manifests .items ():
73
83
cut_set = CutSet .from_manifests (
74
84
recordings = manifests ["recordings" ],
75
85
supervisions = manifests ["supervisions" ],
76
86
)
77
87
sampling_rate = next (iter (cut_set )).sampling_rate
78
- with ProcessPoolExecutor (num_jobs ) as ex :
88
+ with ProcessPoolExecutor (args . num_jobs ) as ex :
79
89
if "train" in partition :
80
- # original set
81
- with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _orig" ) as storage :
82
- cut_set_orig = cut_set .compute_and_store_features (
90
+ # split negative recordings into smaller chunks with lengths sampled from
91
+ # length distribution of positive recordings
92
+ pos_durs = get_positive_durations (manifests ["supervisions" ])
93
+ with numpy_seed (args .seed ):
94
+ cut_set = keep_positives_and_split_negatives (
95
+ cut_set ,
96
+ pos_durs ,
97
+ max_remaining_duration = args .max_remaining_duration ,
98
+ overlap_duration = args .overlap_duration ,
99
+ )
100
+ # "clean" set
101
+ with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _clean" ) as storage :
102
+ cut_set_clean = cut_set .compute_and_store_features (
83
103
extractor = mfcc ,
84
104
storage = storage ,
85
105
augmenter = None ,
86
106
executor = ex ,
87
107
)
88
- # augmented with reverbration
89
- with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _rev" ) as storage :
90
- cut_set_rev = cut_set .compute_and_store_features (
91
- extractor = mfcc ,
92
- storage = storage ,
93
- augmenter = WavAugmenter (effect_chain = reverb ()),
94
- excutor = ex ,
95
- )
108
+ # augmented with reverberation
109
+ with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _rev" ) as storage :
110
+ with numpy_seed (args .seed ):
111
+ cut_set_rev = cut_set .compute_and_store_features (
112
+ extractor = mfcc ,
113
+ storage = storage ,
114
+ augmenter = WavAugmenter (effect_chain = reverb ()),
115
+ excutor = ex ,
116
+ )
96
117
cut_set_rev = CutSet .from_cuts (
97
118
cut .with_id ("rev-" + cut .id ) for cut in cut_set_rev .cuts
98
119
)
99
120
# augmented with speed perturbation
100
- with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _sp1.1" ) as storage :
121
+ with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _sp1.1" ) as storage :
101
122
cut_set_sp1p1 = cut_set .compute_and_store_features (
102
123
extractor = mfcc ,
103
124
storage = storage ,
@@ -109,7 +130,7 @@ def main(args):
109
130
cut_set_sp1p1 = CutSet .from_cuts (
110
131
cut .with_id ("sp1.1-" + cut .id ) for cut in cut_set_sp1p1 .cuts
111
132
)
112
- with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _sp0.9" ) as storage :
133
+ with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } _sp0.9" ) as storage :
113
134
cut_set_sp0p9 = cut_set .compute_and_store_features (
114
135
extractor = mfcc ,
115
136
storage = storage ,
@@ -121,9 +142,9 @@ def main(args):
121
142
cut_set_sp0p9 = CutSet .from_cuts (
122
143
cut .with_id ("sp0.9-" + cut .id ) for cut in cut_set_sp0p9 .cuts
123
144
)
124
- # combine the original and augmented sets together
145
+ # combine the clean and augmented sets together
125
146
cut_set = combine (
126
- cut_set_orig , cut_set_rev , cut_set_sp1p1 , cut_set_sp0p9
147
+ cut_set_clean , cut_set_rev , cut_set_sp1p1 , cut_set_sp0p9
127
148
)
128
149
else : # no augmentations for dev and test sets
129
150
with LilcomFilesWriter (f"{ output_dir } /feats_{ partition } " ) as storage :
@@ -137,6 +158,80 @@ def main(args):
137
158
cut_set .to_json (output_dir / f"cuts_{ partition } .json.gz" )
138
159
139
160
161
def get_positive_durations(sup_set: "SupervisionSet") -> List[float]:
    """
    Get the duration values of all positive recordings.

    Assumes ``SupervisionSegment.text`` is "FREETEXT" for every negative
    recording, and that ``SupervisionSegment.duration`` equals the duration of
    the corresponding recording.

    Args:
        sup_set (SupervisionSet): supervisions to collect the durations from

    Returns:
        list[float]: durations of all supervisions whose text is not "FREETEXT"
    """
    # NOTE: the annotation is quoted on purpose. The only visible import of
    # SupervisionSet lives inside main(), so an unquoted module-level annotation
    # would depend on a name that may not be bound when this def is evaluated.
    return [
        sup.duration
        for sup in sup_set.filter(lambda seg: seg.text != "FREETEXT")
    ]
168
+
169
+
170
def keep_positives_and_split_negatives(
    cut_set: "CutSet",
    durations: List[float],
    max_remaining_duration: float = 0.3,
    overlap_duration: float = 0.3,
) -> "CutSet":
    """
    Returns a new CutSet where all the positives are taken unchanged from the
    input cut set, and the negatives are obtained by splitting the original
    negatives into shorter chunks of random lengths drawn from the given length
    distribution (here it is the empirical distribution of the positive
    recordings). Adjacent chunks can overlap.

    A cut is treated as negative when the text of its single supervision is
    "FREETEXT". Lengths are drawn with ``np.random``, so callers that need
    reproducible splits must seed it beforehand.

    Args:
        cut_set (CutSet): original input cut set; every cut must have exactly
            one supervision
        durations (list[float]): list of durations to sample from
        max_remaining_duration (float, optional): not split if the left-over
            duration is less than this many seconds (default: 0.3).
        overlap_duration (float, optional): overlap between adjacent segments
            (default: 0.3)

    Returns:
        CutSet: a new cut set after split

    Raises:
        ValueError: if a negative cut has to be split but no entry of
            ``durations`` is longer than ``overlap_duration``
    """
    assert max_remaining_duration >= 0.0 and overlap_duration >= 0.0

    # A chunk no longer than the overlap would leave the split point where it is
    # (or move it backwards), so only durations that make progress are sampled.
    # When every duration already exceeds the overlap this list equals
    # `durations` and the sequence of random draws is unchanged.
    usable_durations = [dur for dur in durations if dur > overlap_duration]

    def sample_duration() -> float:
        # Checked lazily so that inputs without negatives never need durations.
        if not usable_durations:
            raise ValueError(
                "cannot split negative cuts: no duration is longer than "
                "overlap_duration={}".format(overlap_duration)
            )
        return usable_durations[np.random.randint(len(usable_durations))]

    def chunk_id(base_id: str, start: float, end: float) -> str:
        # The chunk boundaries (relative to the original cut, scaled by 100 and
        # rounded) are appended to the id so that every chunk id is distinct.
        return "{id}-{s:07d}-{e:07d}".format(
            id=base_id,
            s=int(round(100 * start)),
            e=int(round(100 * end)),
        )

    new_cuts = []
    for cut in cut_set:
        assert len(cut.supervisions) == 1
        if cut.supervisions[0].text != "FREETEXT":  # keep the positive as it is
            new_cuts.append(cut)
            continue

        # `offset` is relative to the start of the cut, which is what
        # `cut.truncate` is given below.
        offset = 0.0
        remaining_duration = cut.duration
        this_dur = sample_duration()
        # Keep carving chunks while more than `max_remaining_duration` would be
        # left over after the next one.
        while remaining_duration > this_dur + max_remaining_duration:
            new_cut = cut.truncate(
                offset=offset, duration=this_dur, preserve_id=True
            )
            new_cuts.append(
                new_cut.with_id(chunk_id(new_cut.id, offset, offset + this_dur))
            )
            # Advance by less than the chunk length so adjacent chunks overlap.
            stride = this_dur - overlap_duration
            offset += stride
            remaining_duration -= stride
            this_dur = sample_duration()

        # Whatever is left becomes the final chunk, running to the end of the cut.
        new_cut = cut.truncate(offset=offset, preserve_id=True)
        new_cuts.append(new_cut.with_id(chunk_id(new_cut.id, offset, cut.duration)))

    return CutSet.from_cuts(new_cuts)
233
+
234
+
140
235
def reverb (* args , ** kwargs ):
141
236
"""
142
237
Returns a reverb effect for wav augmentation.
0 commit comments