forked from megseekosh/Categorize_app_v2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_split_app_basel.py
418 lines (388 loc) · 18.5 KB
/
1_split_app_basel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
# When running this in terminal, you need to specify the following args:
# child_ID birth_date record_date gender
# in that order. eg 'python split_app_luca.py 1234 20221231 20230131 baby'
# gender does not seem to be used outside of record-keeping purposes
# Adding the literal word 'globals' anywhere on the command line switches on
# the default of the `globals` option of main().
import sys

# Set the module-level values that are used further down as default
# arguments. When this file is run as a script they come from the command
# line; when it is imported they are all None.
if __name__ == "__main__":
    if len(sys.argv) < 5:
        # A missing argument is a usage problem, not a syntax error: exit
        # with a readable message (printed to stderr, exit status 1)
        # rather than a misleading SyntaxError traceback.
        sys.exit("""Not enough arguments supplied. Please re-run.
Next time, specify the following args:
child_ID birth_date record_date gender
in that order. eg 'python split_app_luca.py 1234 20221231 20230131 baby'""")
    child_ID, birth_date, record_date, gender = sys.argv[1:5]
else:
    child_ID = birth_date = record_date = gender = None

# set the default value for 'globals' in main()
globalVars = "globals" in sys.argv
# For the file dialog
import tkinter as tk
from tkinter.filedialog import askopenfilename, askdirectory
from tkinter.messagebox import showinfo
# For the info output
import csv
import os
# These are used to split audio and tell its speech contents
from pydub import AudioSegment
from pydub.utils import mediainfo
from find_speech import vad_trial
# for merging segments
from scipy.optimize import minimize_scalar
import pandas as pd
from random import shuffle
# function to comb through a df and merge segments with the same label that are close in time
def merge_segments(df, max_distance, verbose=False, timed=False):
    """
    Merge segments that share a label and start shortly after one another.

    df: a pandas dataframe with columns ['segtype', 'startsec', 'endsec', 'duration']
    max_distance: maximum distance (in time) two segments can be from one another to be merged
    verbose: print the row being examined and the candidate rows found for it
    timed: print how many seconds the whole merge took

    For every row that has not already been absorbed, look for rows with the
    same segtype whose startsec lies between that row's endsec and
    endsec + max_distance. If there are any, the row's endsec is extended to
    the latest endsec among them, and every row (of any segtype) that starts
    inside that window and ends no later than the new endsec is dropped.

    Returns a copy of df with segments merged according to max_distance and
    with 'duration' recomputed from the merged start/end times.
    Will add column 'checked_for_merge' to the original dataframe. Value is True if that row
    was checked.

    Note: each row is extended at most once, using the endsec it had in the
    original df, so merges do not chain through an absorbed row.
    """
    if timed:
        import time
        start = time.time()
    # don't want to change original df's rows, so will modify/return a copy (newdf)
    newdf = df.copy()
    # keeps track of whether we've already checked/merged a given row;
    # this column is the only part of the original df that is modified
    df['checked_for_merge'] = False
    for row in df.itertuples():
        if df.at[row.Index, 'checked_for_merge']:
            # already absorbed into an earlier row
            continue
        if verbose:
            print("Looking at row " + str(row.Index))
        # Merge this segment with any that have the same name, if they are
        # within a tolerable duration
        latest_startsec = row.endsec + max_distance
        # A zero-length row starts exactly at its own endsec, so it would
        # match its own window and end up dropping itself; exclude it.
        not_self = newdf.index != row.Index
        # look for segments with start times between this row's end time
        # and this row's end time plus the maximum tolerable distance
        # i.e. subsequent instances of the same segment that are 'close enough'
        timemask = ((newdf['startsec'] >= row.endsec)
                    & (newdf['startsec'] <= latest_startsec)
                    & not_self)
        segmask = newdf['segtype'] == row.segtype
        matches = newdf[timemask & segmask]
        if verbose:
            print(matches)
        if len(matches) > 0:  # if any rows match both masks
            # figure out the new end time of the merged segment
            new_endsec = matches['endsec'].max()
            # everything that starts in the window and finishes by the new
            # end time is swallowed by the merged segment
            subdf = newdf[timemask & (newdf['endsec'] <= new_endsec)]
            if verbose:
                print(subdf)
            # update the original entry with that new end time
            newdf.at[row.Index, 'endsec'] = new_endsec
            # drop the rows we're merging from the new dataframe
            newdf = newdf.drop(subdf.index)
            # skip these rows when the loop reaches them
            df.loc[subdf.index, 'checked_for_merge'] = True
        df.at[row.Index, 'checked_for_merge'] = True
    # endsec may have moved, so the stored durations are stale; refresh them
    # (skipped only when the timing columns are absent, e.g. an empty frame)
    if 'startsec' in newdf.columns and 'endsec' in newdf.columns:
        newdf['duration'] = newdf['endsec'] - newdf['startsec']
    if timed:
        end = time.time()
        print('took ' + str(end-start) + ' seconds')
    return newdf
def merge_adjacent_segments(df, max_distance=0, verbose=False):
    """
    Collapse runs of consecutive rows that carry the same label.

    df: a pandas dataframe with columns ['segtype', 'startsec', 'endsec', 'duration']
    max_distance: acceptable gap between two instances of the same segment
    verbose: print the position of each row as it is visited

    This function goes row by row, in the order the rows appear in df; for
    each row, it checks whether it has the same segtype as the row being
    merged into and starts no later than max_distance seconds after that
    row's (possibly already extended) end. If so the two are combined into
    one row. Any number of consecutive rows that satisfy these criteria are
    merged into a single row.

    Returns a copy of df with all merges applied and 'duration' recomputed;
    df itself is not modified.
    """
    # don't want to change original df, so will modify/return a copy (newdf)
    newdf = df.copy()
    # Index labels of rows that were folded into an earlier row. They are
    # dropped in one go after the loop: dropping inside the loop rebuilds
    # the whole frame every time and makes the merge quadratic.
    absorbed = []
    last_index = None
    # since indices were preserved from an original df, they may not be
    # sequential - i is the position, this_index is the actual label
    for i, this_index in enumerate(df.index):
        if verbose:
            print(i)
        if i == 0:
            # nothing to compare the first row against
            last_index = this_index
            continue
        # get the latest possible start time
        max_startsec = newdf.at[last_index, 'endsec'] + max_distance
        # check if the segments match and times are sequential
        mergeable = (newdf.at[last_index, 'segtype'] == newdf.at[this_index, 'segtype']
                     and newdf.at[this_index, 'startsec'] <= max_startsec)
        if mergeable:
            # extend the previous row; max() keeps the end time from moving
            # backwards when this row finishes before the previous one does
            newdf.at[last_index, 'endsec'] = max(newdf.at[last_index, 'endsec'],
                                                 df.at[this_index, 'endsec'])
            absorbed.append(this_index)
            # don't update last_index - that row is still the one to
            # compare against and merge into
        else:
            last_index = this_index
    if absorbed:
        newdf = newdf.drop(absorbed)
    # recalculate durations
    # this is more easily vectorized so should go fast
    newdf['duration'] = newdf['endsec'] - newdf['startsec']
    return newdf
# function to evaluate how close a merge gets you to a certain distance
def evaluate_merged_durations(df, desired_median, max_distance=0):
    """
    Score how close a merge brings the median segment length to a target.

    df: a pandas dataframe with 'segtype', 'startsec' and 'endsec' columns
        (see help(merge_segments))
    desired_median: the median duration you want
    max_distance: see help(merge_segments)

    Runs merge_segments, calculates the median duration of the merged
    segments, and returns the absolute difference between that median and
    desired_median (0 means the target was hit exactly).

    Durations are measured from startsec/endsec of the merged rows rather
    than read from the 'duration' column, so the score reflects the merged
    boundaries even if that column was not refreshed by the merge.

    Side effect: merge_segments adds a 'checked_for_merge' column to df.
    """
    newdf = merge_segments(df, max_distance)
    merged_durations = newdf['endsec'] - newdf['startsec']
    return abs(merged_durations.median() - desired_median)
# optimize maximum distance
def optimize_merged_durations(dataframe, goal):
    """
    Search for the merge gap that brings the median duration closest to goal.

    dataframe: segments dataframe, passed unchanged to evaluate_merged_durations
    goal: the median duration you want

    Returns whatever scipy.optimize.minimize_scalar returns for the search.
    """
    search_bracket = (0, goal*2)
    search_bounds = (0, 9999)

    def distance_from_goal(candidate_gap):
        # the quantity being minimised: how far the merged median is from goal
        return evaluate_merged_durations(dataframe, goal, max_distance=candidate_gap)

    # NOTE(review): both `bracket` and `bounds` are supplied and no `method`
    # is named, so which of the two is honoured is decided by SciPy -
    # confirm against the installed version's minimize_scalar documentation.
    return minimize_scalar(distance_from_goal, bracket=search_bracket, bounds=search_bounds)
# not sure why split_file was kept in a separate file before
def split_file(source_file, output_directory, segments_dataframe,
               child_ID=child_ID, record_date=record_date, verbose=True):
    """
    Cut one .wav file into one clip per row of segments_dataframe.

    source_file: path of the .wav recording to cut up
    output_directory: directory the clips are written into
    segments_dataframe: dataframe with 'startsec' and 'endsec' columns (in
        seconds); every row produces one clip, numbered by its index label
    child_ID, record_date: strings used to build the clip file names
    verbose: print the path of every clip as it is written

    Clips are written as
        output_directory/childID_recorddate_<zero-padded row index>.wav
    and exported with the bit rate that mediainfo reports for source_file.

    Returns a list with one [row index, clip path] pair per clip written.
    """
    # This is the audio input which we will be splitting
    sound = AudioSegment.from_file(source_file, format='wav')
    # This is metadata about the file we will be splitting
    data = mediainfo(source_file)
    # Work out how many digits to reserve for the clip number. The number
    # printed is the row's index label, which can be larger than the row
    # count once rows have been filtered out, so size the padding for
    # whichever of the two needs more digits.
    n_digits = len(str(len(segments_dataframe)))
    if len(segments_dataframe) > 0:
        n_digits = max(n_digits, len(str(max(segments_dataframe.index))))
    # should be like directory/childID_recorddate_rownumber
    # (built by concatenation so that a '%' in any part is taken literally)
    name_prefix = str(output_directory) + '/' + child_ID + '_' + record_date + '_'
    # loop over the segments df and chop the audio file to match its segmentation
    rows_used = []
    for row in segments_dataframe.itertuples():
        # identify time stamps
        start_ms = float(row.startsec) * 1000  # convert to ms
        end_ms = float(row.endsec) * 1000
        # generate file name
        this_filepath = name_prefix + ('%0*d' % (n_digits, row.Index)) + ".wav"
        if verbose:
            print(this_filepath)
        # remember which row this clip came from
        rows_used.append([row.Index, this_filepath])
        # split out the audio file
        split_sound = sound[start_ms:end_ms]
        split_sound.export(this_filepath,
                           format="wav",
                           bitrate=data["bit_rate"])
    print("Split into " + str(len(rows_used)) + " clips")
    return rows_used
def main(child_ID=child_ID, birth_date=birth_date, record_date=record_date, gender=gender, globals=globalVars):
    """
    This is the user-oriented function which
    1. Asks the user for an audio file to process
    2. Asks the user for the CSV produced by segments.pl
    3. Asks the user for an output directory
    4. Asks on the console which LENA labels to keep, how large a gap to
       merge across, the minimum voicing and the minimum clip duration
    5. Filters and merges the segments, then splits the audio file using
       split_file()
    6. Writes one row per clip, including the voicing value returned by
       vad_trial(), to config.csv (clips above the voicing threshold, in
       random order) or rejects.csv (all other clips) in the output directory

    child_ID, birth_date, record_date, gender: copied into every CSV row;
        child_ID and record_date also name the clips (see split_file).
        They default to the module-level values of the same names.
    globals: set to True to make most variables global (eg for inspection)
        NOTE(review): Python treats a `global` statement as a declaration
        for the whole function, even when it sits inside an `if`, so the
        names listed below are bound at module level whatever this flag
        says. Behaviour is left as it was.

    Returns the output directory, or False if the user dismissed any of the
    three file dialogs.
    """
    if globals:
        global audio_file, segments_csv, output_dir, label_choices
        global segments_df, rows_used, clips_info

    # Tell the user what to do
    showinfo('Window', "Select an audio file to cut apart")
    # Get audio file
    audio_file = askopenfilename(
        filetypes=(("Audio File", "*.wav"), ("all files", "*.*")),
        title="Please choose a .wav file"
    )
    print(audio_file)
    # If no file is selected
    if audio_file == '':
        return False

    # Tell the user what to do
    showinfo('Window', "Select the CSV file output by segments.pl")
    segments_csv = askopenfilename(
        filetypes=(("CSV File", "*.csv"), ("all files", "*.*")),
        title="Please choose a .csv file generated by segments.pl"
    )
    print(segments_csv)
    # if the CSV file was not selected
    if segments_csv == '':
        return False

    # Tell the user what to do
    showinfo('Window', "Select an output directory to save the audio chunks")
    # Select the output directory
    output_dir = askdirectory()
    print(output_dir)
    # if the output directory was not selected
    if output_dir == '':
        return False

    # ask user to filter LENA segments
    selection_methods = {"1": 'include', "2": 'exclude'}
    print()  # line break to clearly separate this msg
    # keep asking, so we don't crash if user gives input we don't like
    while True:
        print("I can filter to only certain LENA annotations, or all but certain ones.")
        print("Please select the option you prefer:")
        print("1: Specify which LENA annotations to INCLUDE")
        print("2: Specify which LENA annotations to EXCLUDE")
        typed_choice = input("Your choice: ")
        if typed_choice in selection_methods:
            lena_selection_method = selection_methods[typed_choice]
            break
        print()
        print("Sorry, you need to type 1 or 2 and then press enter. Let's try again.")

    # check which LENA annotations to include
    if lena_selection_method == 'include':
        print("Please type which LENA labels to include, separated by a comma. For example,"
              " typing 'FAN,MAN' (without quotes) will filter to Female Adult Near and Male Adult Near"
              " annotations.")
        label_choices = input("Labels to include: ")
    else:
        print("Please type which LENA labels to exclude, separated by a comma. For example,"
              " typing 'FAN,MAN' (without quotes) will filter out Female Adult Near and Male Adult Near"
              " annotations.")
        label_choices = input("Labels to exclude: ")
    # change text to uppercase in case of typo, split into a list, and trim
    # each label so that 'FAN, MAN' works the same as 'FAN,MAN'
    label_choices = [label.strip() for label in label_choices.upper().split(',')]

    # ask how much of a gap between segments is allowed
    print()
    print("I'll merge any sequential segments for you. How much of a gap should I allow?")
    tolerable_distance = input('Longest permissible gap (in seconds): ')
    # since we're already prompting here, also check what percentage of voicing is OK
    print("What's the minimum percentage of voicing you want? Clips with a lower percentage will be excluded.")
    minimum_voicing_percent = input("Minimum voicing percentage (eg '0.1' for 10%): ")
    minimum_voicing_percent = float(minimum_voicing_percent)
    # convert to number
    if '.' in tolerable_distance:
        tolerable_distance = float(tolerable_distance)
    else:
        tolerable_distance = int(tolerable_distance)
    # and ask for minimum duration
    print("You want clips no shorter than...")
    minimum_duration = input('Minimum clip duration (in seconds): ')
    minimum_duration = float(minimum_duration)

    # read the CSV output of segments.pl
    print("Reading segments data...")
    with open(segments_csv, newline='') as csvfile:
        segments_reader = csv.DictReader(csvfile, delimiter=',')
        segments_data = [row for row in segments_reader]
    # convert segments_data to better types, add duration
    for row in segments_data:
        row['startsec'] = float(row['startsec'])
        row['endsec'] = float(row['endsec'])
        row['duration'] = row['endsec'] - row['startsec']
    # make that into a pandas dataframe - this allows easier data management
    segments_df = pd.DataFrame(segments_data)

    # enforce minimum duration
    print("Enforcing minimum duration...")
    segments_df = segments_df.loc[segments_df['duration'] >= minimum_duration]
    # renumber the surviving rows from 0; these numbers end up in the clip
    # file names (the old row numbers are kept in an 'index' column)
    segments_df = segments_df.reset_index()
    print("")

    # filter to interesting segments, limiting to choices made by user
    print("Filtering segments with Lena...")
    label_was_named = segments_df['segtype'].isin(label_choices)
    if lena_selection_method == 'include':
        segments_df = segments_df.loc[label_was_named]
    else:
        segments_df = segments_df.loc[~label_was_named]
    print("")

    # merge segments that are adjacent (within the permitted gap)
    print('Merging segments...')
    segments_df = merge_adjacent_segments(segments_df, max_distance=tolerable_distance)
    print("")

    # Gather metadata
    print("Gathering metadata...")
    rows_used = split_file(audio_file, output_dir, segments_df,
                           child_ID=child_ID, record_date=record_date)
    clips_info = []  # this was originally called percents
    rejected_clips = []
    # each entry of rows_used is [row index in segments_df, path of the clip]
    for row_index, filepath in rows_used:
        # categorize_app expects only the file name, not full path
        filename_for_csv = os.path.basename(filepath)
        vad_percents = vad_trial(filepath)
        start_time = segments_df.at[row_index, 'startsec'] * 1000  # convert to ms
        end_time = segments_df.at[row_index, 'endsec'] * 1000
        duration = segments_df.at[row_index, 'duration']
        # columns: file name, id, age, date, gender, start time, percents,
        # end time, duration of clip
        row = [filename_for_csv, child_ID, birth_date, record_date, gender,
               start_time, vad_percents, end_time, duration]
        # continue if this meets the voicing requirement, otherwise put it in the reject pile
        if vad_percents > minimum_voicing_percent:
            clips_info.append(row)
        else:
            rejected_clips.append(row)
    # randomize the order of rows
    shuffle(clips_info)

    # Header shared by both output files. The rows built above fill only the
    # first nine columns; nothing is written here for the last two.
    header = ['file_name', 'id', 'age_YYMMDD', 'date_YYYYMMDD', 'gender',
              'start_time', 'percents_voc', 'end_time', 'duration',
              'researcher_present', 'sleeping']
    # Write the config.csv data log file
    # (newline='' is what the csv module asks for; without it every row is
    # followed by a blank line on Windows)
    print("Writing the csv")
    with open(os.path.join(output_dir, "config.csv"), 'w', newline='') as output:
        writer = csv.writer(output, delimiter=',')
        writer.writerow(header)
        writer.writerows(clips_info)
    # And the rejects.csv file
    with open(os.path.join(output_dir, "rejects.csv"), 'w', newline='') as output:
        writer = csv.writer(output, delimiter=',')
        writer.writerow(header)
        writer.writerows(rejected_clips)
    return output_dir
# Running the script
if __name__ == '__main__':
    # Make sure the general tk window does not appear
    tk.Tk().withdraw()
    # Run the program
    output_directory = main()  # returns output directory, or False if a dialog was cancelled
    if output_directory:
        print("Job's done! Output is at " + output_directory)
    else:
        # main() gave up early; concatenating False onto the message would
        # raise a TypeError, so report the cancellation instead
        print("Nothing was selected, so no clips were written.")