-
Notifications
You must be signed in to change notification settings - Fork 1
/
util.py
executable file
·814 lines (687 loc) · 29.4 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
import music21
from music21 import converter, corpus, instrument, midi, note, chord, pitch, stream
from PIL import Image
from tqdm import tqdm
from reference import *
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import sys
import os
import copy
from sklearn.metrics.pairwise import cosine_similarity
''' SAVE DIRECTORIES '''
# All directories are relative paths that end in '/', because the functions in
# this module build file paths by plain string concatenation.
# Where input MIDI files are stored
INPUT_MIDI_DIR = 'input_midi/'
# Where input MIDI images are stored
INPUT_IMAGE_DIR = 'input_images/'
# Where input numpy arrays are stored
INPUT_NUMPY_DIR = 'input_numpy/'
# Where input text is stored (e.g. for GPT)
INPUT_TEXT_DIR = 'input_text/'
# Where output MIDI files are stored
OUTPUT_MIDI_DIR = 'output_midi/'
# Where keras models are stored
MODEL_SAVE_DIR = 'keras_models/'
# Where entire corpuses are stored
NUMPY_CORPUS_DIR = 'numpy_corpus/'
# Where generated images are stored
OUTPUT_IMAGE_DIR = 'output_images/'
''' OTHER CONSTANTS '''
# Quarter note scale: one quarter note is split up into SCALE pixels (timesteps).
# NOTE(review): this comment used to say 12 pixels while the value is 6 - confirm which is intended.
SCALE = 6
# Maximum velocity (used to scale velocities to and from the 0-1 range)
MAX_VEL = 127
# Highest key
MAX_KEY = 127
# Highest pixel value (greyscale and RGB)
MAX_PIXEL_STRENGTH = 255
# Number of piano keys
NUM_KEYS = 88
# np.set_printoptions(threshold=sys.maxsize)
def load_numpy(fname):
    '''Read one saved array from the numpy input directory
    Details:
        fname: file path relative to INPUT_NUMPY_DIR (joined by plain string concatenation)
        returns: the object np.load produces for that file (a numpy array for .npy files)'''
    full_path = INPUT_NUMPY_DIR + fname
    return np.load(full_path)
def open_midi(midi_path, prefix=''):
    '''Open a MIDI file and return it as a Stream object
    Details:
        midi_path: MIDI filename (relative to INPUT_MIDI_DIR)
        prefix: string prepended to INPUT_MIDI_DIR (e.g. a parent directory ending in '/')
        returns: Stream object produced by midi.translate.midiFileToStream'''
    mf = midi.MidiFile()
    mf.open(prefix + INPUT_MIDI_DIR + midi_path)
    try:
        mf.read()
    finally:
        # Release the file handle even when the file cannot be parsed
        mf.close()
    return midi.translate.midiFileToStream(mf)
def key_properties(nt, off=-1):
    '''Describe a single note as [pitch, offset, duration, velocity]
    Details:
        nt: a Note object
        off: offset to report instead of the note's own; -1 (the default) means "use nt.offset".
             Needed for chord members, where the chord carries the offset and the notes do not
        returns: a 4-element list [midi pitch, offset, quarterLength, velocity]'''
    start = nt.offset if off == -1 else off
    return [nt.pitch.midi, start, nt.duration.quarterLength, nt.volume.velocity]
def extract_relevant_data(elements, verbose=False):
    '''Flatten notes and chords into a normalized list of note records
    Details:
        elements: a collection of notes and chords (anything else is ignored)
        verbose: forwarded to normalize
        returns: a list of notes, each a list [pitch, offset, duration, velocity], after normalize()'''
    collected = []
    for item in elements:
        if isinstance(item, note.Note):
            collected.append(key_properties(item))
            continue
        if isinstance(item, chord.Chord):
            # Chord members have no offset of their own, so every member inherits the chord's
            chord_offset = item.offset
            collected.extend(key_properties(member, off=chord_offset) for member in item.notes)
    return normalize(collected, verbose=verbose)
def normalize(all_notes, verbose=False):
    '''Standardize note records in place: scale time, start at 0, cast to int, transpose key
    Details:
        all_notes: a list of notes, each [pitch, offset, duration, velocity]
        verbose: forwarded to normalize_key
        returns: the list of notes after key normalization'''
    # Convert quarter-note units into pixel (timestep) units
    for entry in all_notes:
        entry[1] *= SCALE
        entry[2] *= SCALE
    # Earliest onset becomes timestep 0
    earliest = min(entry[1] for entry in all_notes)
    for entry in all_notes:
        entry[0], entry[1], entry[2], entry[3] = (
            int(max(0, entry[0])),       # pitch is never negative
            int(entry[1] - earliest),    # shift so the piece starts at 0
            int(max(1, entry[2])),       # every note lasts at least one timestep
            int(entry[3]),
        )
    return normalize_key(all_notes, verbose=verbose)
def normalize_key(all_notes, verbose=False):
    '''Predicts the key from note distribution and transposes to C
    Details:
        all_notes: an array of notes represented by arrays
        verbose: forwarded to detect_key
        returns: an array of notes represented by arrays (transposed in place by transpose())'''
    frequencies = get_frequencies(all_notes)
    key = detect_key(frequencies, verbose=verbose)
    # Take the shortest route to C: keys up to F# (6) move down by `key` semitones,
    # higher keys move UP by the remaining 12 - key semitones (e.g. G = 7 goes up 5).
    # The previous code moved those keys down by 12 - key, which does not land on C.
    shift = 12 - key if key > 6 else -key
    return transpose(all_notes, shift)
def get_frequencies(all_notes):
    '''Computes the relative frequency of each pitch class
    Details:
        all_notes: an array of notes represented by arrays (element 0 is the integer pitch)
        returns: an array of 12 floats with the share of notes per pitch class (0:C ... 11:B);
                 all zeros when all_notes is empty'''
    counts = np.zeros(12)
    for entry in all_notes:
        # C1 = 24 and 24 % 12 = 0, so pitch % 12 is the pitch class with C at index 0
        counts[entry[0] % 12] += 1
    total = len(all_notes)
    if total == 0:
        # Avoid 0/0, which would fill the result with NaN
        return counts
    return counts / total
def transpose(all_notes, transpose_amt):
    '''Shift the pitch of every note by a fixed number of semitones, in place
    Details:
        all_notes: an array of notes represented by arrays (element 0 is the pitch)
        transpose_amt: number of semitones to add to each pitch (+ for up, - for down)
        returns: the same all_notes object, after modification'''
    for entry in all_notes:
        entry[0] += transpose_amt
    return all_notes
def _shift_by (arr, k):
'''Shift all elements in an array in a circular manner
Details:
arr: an array
k: number of "shifts"
returns: an array'''
return np.concatenate((arr[-k:], arr[:-k]))
def _get_canonical_scale():
    '''Build the reference note distribution of every major key
    Details:
        returns: a (12, 12) array; row i is the major-scale template (7 scale tones,
                 each weighted 1/7) circularly shifted by i positions'''
    template = np.array([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1]) / 7
    return np.array([_shift_by(template, tonic) for tonic in range(12)])
def detect_key(input_frequencies, verbose=False):
    '''Pick the major key whose reference distribution best matches the input
    Details:
        input_frequencies: an array containing the frequency of each note (0:C ... 11:B)
        verbose: if True, prints the detected key index
        returns: an integer 0 - 11 representing the key of the music (0:C ... 11:B);
                 on ties the lowest index wins because only a strictly better score replaces it'''
    best_key = -1
    best_score = -1
    for candidate, profile in enumerate(_get_canonical_scale()):
        score = cosine_similarity([input_frequencies], [profile])
        if score > best_score:
            best_key, best_score = candidate, score
    if verbose:
        print("Key: {}".format(best_key))
    return best_key
def inferPart(parts):
    '''Find the index of the first part played by a piano
    Details:
        parts: a collection of Parts
        returns: index of the first part whose instrument is named 'Piano right' or 'Piano',
                 or -1 when no such part exists'''
    for i, part in enumerate(parts):
        inst = part.getInstrument(returnDefault=False)
        # With returnDefault=False a part without an instrument gives None; skip it
        # instead of failing on None.instrumentName
        if inst is not None and inst.instrumentName in ('Piano right', 'Piano'):
            return i
    return -1


def print_instruments(parts):
    '''Helpful I/O function
    Details:
        parts: a collection of Parts
        returns: None, just prints one line per part with its instrument name
                 ('None' when the part has no instrument)'''
    print('PRINTING PARTS')
    for i, part in enumerate(parts):
        inst = part.getInstrument(returnDefault=False)
        name = inst.instrumentName if inst is not None else None
        print('Part ' + str(i) + ': ' + str(name))
def featurize_notes(midi, mode='binary', piano_part=-1, verbose=False):
    '''Turns MIDI into a tensor (matrix, in this case)
    Details:
        midi: a parsed score exposing .parts (a collection of Parts)
        mode: 'binary' velocity or 'grayscale' velocity (scaled to 0-1)
        piano_part: index of piano Part (-1 if it doesn't need to be specified)
        verbose: if True, prints warnings when the piano part has to be guessed
        returns: a 2D matrix, like a piano roll'''
    # If there's more than one part and we're not told which to use
    if len(midi.parts) != 1 and piano_part == -1:
        # See if one of the instruments is named Piano
        try:
            inferred = inferPart(midi.parts)
        except Exception:
            # Best effort: fall back to part 0 (usually the piano) when the parts cannot be
            # inspected. Only Exception is caught so Ctrl-C / SystemExit still propagate.
            piano_part = 0
            if verbose:
                print('[WARNING]: Guessed 0. Error occurred when parsing parts.')
        else:
            # No error, use inferred if it found a piano part
            if inferred != -1:
                piano_part = inferred
            else:
                piano_part = 0
                if verbose:
                    print('[WARNING]: Guessed 0. Parts shown below:')
                    print_instruments(midi.parts)
    # A single-part score always uses that part, whatever piano_part says
    part_index = 0 if len(midi.parts) == 1 else piano_part
    elements = midi.parts[part_index].flat.notes
    # Get tensor
    return get_tensor_from_part(elements, mode=mode, verbose=verbose)
def get_tensor_from_part(elements, mode, verbose=False):
    '''Render a collection of notes and chords as a piano-roll matrix
    Details:
        elements: a collection of Notes and Chords
        mode: 'binary' writes 1 for a sounding cell, 'grayscale' writes velocity / MAX_VEL;
              any other value leaves the roll empty
        verbose: forwarded to extract_relevant_data
        returns: a 2D matrix (keys x timesteps) after gap removal and cropping to the piano range'''
    all_notes = extract_relevant_data(elements, verbose=verbose)
    roll = np.zeros(shape=(128, get_max_length(all_notes)))
    for key, start, length, velocity in all_notes:
        if mode == 'binary':
            cell = 1
        elif mode == 'grayscale':
            cell = velocity / MAX_VEL
        else:
            continue
        # Fill the whole duration of the note in one go
        roll[key, start:start + length] = cell
    # Long silences are bad for the model
    roll = remove_big_gaps(roll)
    # A piano has fewer keys than the 128 rows used here, so drop the rows outside its range
    return cut_non_piano_notes(roll, inverted=False)
def get_max_length(notes):
    '''Compute the length of the song in timesteps
    Details:
        notes: a list of notes (element 1 is the offset, element 2 the duration)
        returns: the largest offset + duration, i.e. the width the final tensor must have'''
    return max(entry[1] + entry[2] for entry in notes)
def combine_into_rgb(tensor):
    '''Pack every three consecutive columns into one RGB pixel
    Details:
        tensor: a 2D matrix (piano roll)
        returns: a 3D tensor (height, ceil(width / 3), 3); the width is zero-padded on the
                 right first when it is not a multiple of 3'''
    height, width = tensor.shape[0], tensor.shape[1]
    missing = (-width) % 3
    if missing:
        padding = np.zeros(shape=(height, missing))
        tensor = np.hstack((tensor, padding))
    return np.reshape(tensor, (height, tensor.shape[1] // 3, 3))
def get_image(tensor, show=False, mode='L'):
    '''Build an image from a tensor
    Details:
        tensor: a tensor; values are multiplied by 255 and cast to uint8 before conversion
        show: if True, displays the image
        mode: 'L' for B/W, 'RGB' for RGB
        returns: the image created by Image.fromarray'''
    pixels = np.uint8(tensor * 255)
    picture = Image.fromarray(pixels, mode)
    if show:
        picture.show()
    return picture
def save_image(image, fname):
    '''Save an image
    Details:
        image: an image object exposing save() (e.g. the result of get_image)
        fname: destination path, passed unchanged to image.save
        returns: None'''
    image.save(fname)
def convert_midi_to_numpy_image(fname, piano_part=-1, verbose=False, images=True, do_grayscale=False, prefix=''):
    '''Top-level method: converts one MIDI file into saved numpy arrays and (optionally) images
    Details:
        fname: a filename (in INPUT_MIDI_DIR); the text before the first '.' names the output folders
        piano_part: index of the piano Part (-1 means there's no need to specify)
        verbose: if True, prints tensor shapes and other debug output
        images: if True, also saves a PNG next to every saved array
        do_grayscale: if True, additionally produces the velocity-aware (grayscale) variants
        prefix: string prepended to the input/output directory constants
        returns: None'''
    name = fname.split('.')[0]
    numpy_dir = prefix + INPUT_NUMPY_DIR + name + '/'
    image_dir = prefix + INPUT_IMAGE_DIR + name + '/'
    # Create relevant directories
    for out_dir in (numpy_dir, image_dir):
        create_dir(out_dir, verbose=verbose)
    score = open_midi(fname, prefix=prefix)

    # Every entry is (tensor, image mode, numpy file suffix, image file suffix)
    outputs = []

    # Binary piano roll and its RGB-packed version
    binary_tensor = featurize_notes(score, mode='binary', piano_part=piano_part, verbose=verbose)
    if verbose:
        print('Binary tensor shape:', binary_tensor.shape)
    binary_rgb_tensor = combine_into_rgb(binary_tensor)
    if verbose:
        print('Binary compressed (RGB) tensor shape:', binary_rgb_tensor.shape)
    outputs.append((binary_tensor, 'L', '_bin.npy', '_bin.png'))
    outputs.append((binary_rgb_tensor, 'RGB', '_bin_rgb.npy', '_bin_rgb.png'))

    # Optional velocity-aware piano roll and its RGB-packed version
    if do_grayscale:
        grayscale_tensor = featurize_notes(score, mode='grayscale', piano_part=piano_part, verbose=verbose)
        if verbose:
            print('Grayscale tensor size:', grayscale_tensor.shape)
        grayscale_rgb_tensor = combine_into_rgb(grayscale_tensor)
        if verbose:
            print('Grayscale compressed (RGB) tensor shape:', grayscale_rgb_tensor.shape)
        outputs.append((grayscale_tensor, 'L', '_gray.npy', '_gray.png'))
        outputs.append((grayscale_rgb_tensor, 'RGB', '_gray_rgb.npy', '_gray_rgb.png'))

    # Build all images before anything is written to disk
    pictures = []
    if images:
        pictures = [get_image(tensor, mode=pil_mode) for tensor, pil_mode, _, _ in outputs]

    # Arrays are saved first, then the images
    for tensor, _, npy_suffix, _ in outputs:
        np.save(numpy_dir + name + npy_suffix, tensor)
    for picture, (_, _, _, png_suffix) in zip(pictures, outputs):
        save_image(picture, image_dir + name + png_suffix)
def create_dir(dirname, suppress_warnings=False, verbose=False):
    '''Create a directory (and any missing parent directories) if it doesn't exist
    Details:
        dirname: a directory name
        suppress_warnings: if True, doesn't warn about overriding
        verbose: the warning is only printed when this is True
        returns: None'''
    if os.path.isdir(dirname):
        if verbose and not suppress_warnings:
            print('NOTE: Directory \'' + dirname + '\' already exists and may be overridden.')
        return
    # makedirs also creates missing parents (os.mkdir failed when e.g. the top-level
    # output folder did not exist yet); exist_ok covers a directory created concurrently
    os.makedirs(dirname, exist_ok=True)
def remove_big_gaps(tensor, threshold=SCALE*4):
    '''Remove long stretches of nothing from a piano roll
    Details:
        tensor: a 2D matrix (piano roll); columns are scanned left to right
        threshold: window width in columns; a window whose cells are all 0 is cut out
                   (defaults to SCALE*4, i.e. 4 quarter notes)
        returns: a 2D matrix (piano roll) without the removed columns'''
    shape = tensor.shape
    # The loop bound is the width BEFORE any removal; the tensor may shrink while iterating
    width = shape[1]
    for i in range(width):
        # A slice running past the right edge is simply shorter (possibly empty) and np.all of
        # an empty window is True. Consequences: an all-zero tail shorter than `threshold` is
        # removed as well, and once i is past the current width the hstack below changes nothing.
        if np.all((tensor[:, i:(i + threshold)] == 0)):
            # Drop the window; the scan continues at i + 1, so column i is not re-checked
            tensor = np.hstack((tensor[:, :i], tensor[:, (i+threshold):]))
    return tensor
def process_directory(dirname, verbose=False, override=False, prefix=''):
    '''High-level method: converts every file listed in a directory
    Details:
        dirname: directory whose entries are listed; each entry name is then handed to
                 convert_midi_to_numpy_image, which reads it from prefix + INPUT_MIDI_DIR
        verbose: if True, prints debug statements
        override: if False, files whose numpy output folder already exists and is non-empty
                  are skipped; if True, they are processed again
        prefix: string prepended to the directory constants
        returns: None'''
    for fname in tqdm(os.listdir(dirname)):
        name = fname.split('.')[0]
        # Entries starting with '.' have an empty name; ignore them
        if name == '':
            continue
        if verbose:
            print('\nProcessing ' + name + '...')
        out_dir = prefix + INPUT_NUMPY_DIR + name
        if not override and os.path.isdir(out_dir) and len(os.listdir(out_dir)) != 0:
            if verbose:
                print('Skipping \'' + name + '\' because there exists a directory in its name with files inside.')
            continue
        # Use the known piano part for this piece when one is registered, else let it be inferred
        piano_part = piano_parts[name] if name in piano_parts else -1
        convert_midi_to_numpy_image(fname, verbose=verbose, piano_part=piano_part, prefix=prefix)
def numpy_to_part(tensor, is88=True):
    '''Backwards process: turn a piano roll back into a Part of notes
    Details:
        tensor: a 2D matrix (piano roll), rows are keys and columns are timesteps
        is88: if True, row 0 is treated as key 21 (rows are shifted up by 21);
              if False, the row index is used as the key directly
        returns: a Part containing one Note per run of consecutive non-zero cells'''
    part = stream.Part()
    num_keys, num_steps = tensor.shape[0], tensor.shape[1]
    key_shift = 21 if is88 else 0
    final_step = num_steps - 1
    for key in range(num_keys):
        row = tensor[key]
        held = 0          # number of timesteps the current note has lasted so far
        pending = None    # note being built for the current run
        for step in range(num_steps):
            sounding = row[step] != 0
            if sounding:
                if held == 0:
                    # First cell of a run: start a new note, its velocity comes from this cell
                    pending = note.Note(key + key_shift)
                    pending.volume.velocity = row[step] * MAX_VEL
                    pending.offset = step / SCALE
                held += 1
            # A run ends on a silent cell or at the last column
            if held != 0 and (not sounding or step == final_step):
                pending.duration.quarterLength = held / SCALE
                part.insert(pending.offset, pending)
                held = 0
    return part
def part_to_midi(part, fname):
    '''Turn a part into a MIDI file and save it
    Details:
        part: a Part
        fname: output name relative to OUTPUT_MIDI_DIR, without the '.mid' extension
        returns: None'''
    mf = midi.translate.streamToMidiFile(part)
    mf.open(OUTPUT_MIDI_DIR + fname + '.mid', 'wb')
    try:
        mf.write()
    finally:
        # Always release the file handle, even when writing fails
        mf.close()
def spread_out_rgb(tensor):
    '''Unpack RGB pixels back into consecutive B/W columns (3x as wide)
    Details:
        tensor: a 3D tensor (height, width, 3 RGB values)
        returns: a 2D matrix (height, width * 3) (piano roll)'''
    dims = tensor.shape
    return np.reshape(tensor, (dims[0], dims[1] * dims[2]))
def build_dataset(verbose=False, size=NUM_KEYS, save=False, fname='data'):
    '''Concatenate a bunch of files (in the numpy save dir) into one binary dataset
    Details:
        verbose: if True, prints debug statements
        size: height of the piano roll (defaults to 88)
        save: save the dataset to file
        fname: name of file if saved (written to NUMPY_CORPUS_DIR as <fname>.npy)
        returns: a long 2D matrix (piano roll); width 0 when the directory is empty'''
    entries = os.listdir(INPUT_NUMPY_DIR)
    # A zero-width seed fixes the height and float dtype of the result and keeps the
    # empty-directory case working
    pieces = [np.zeros(shape=(size, 0))]
    for dirname in tqdm(entries):
        pieces.append(load_numpy(dirname + '/' + dirname + '_bin.npy'))
    # Concatenate once at the end instead of re-copying the growing array for every file
    long_tensor = np.hstack(pieces)
    if verbose:
        print('Read', len(entries), 'files and concatenated', long_tensor.shape[1], 'vectors.')
    if save:
        np.save('{}{}.npy'.format(NUMPY_CORPUS_DIR, fname), long_tensor)
    return long_tensor
def cut_non_piano_notes(tensor, inverted=False):
    '''Keep only the 88 rows that correspond to real piano keys
    Note that piano rolls with 0-indexing look inverted by default, but are NOT inverted
    Details:
        tensor: a 2D matrix (piano roll)
        inverted: if True, this means the piano roll has been up-down inverted
        returns: rows 21-108 of the tensor (rows 19-106 when inverted)'''
    first_row = 19 if inverted else 21
    return tensor[first_row:first_row + 88, ...]
def image_to_np(image, verbose=False, rotate=True):
    '''Turn an image back into a 128-row piano roll
    Details:
        image: an image
        verbose: if True, prints format, size, mode and array shape, and shows the image
        rotate: if True, the array is transposed (rows and columns swapped) before padding
        returns: a 2D tensor (piano roll) padded by pad_to_128'''
    if verbose:
        for detail in (image.format, image.size, image.mode):
            print(detail)
        image.show()
    pixels = np.asarray(image)
    if verbose:
        print(pixels.shape)
    # Colour images are reduced to a 0/1 matrix first
    if image.mode == 'RGB':
        pixels = flatten_rgb(pixels)
    if rotate:
        pixels = np.transpose(pixels)
    return pad_to_128(pixels, inverted=False)
def flatten_rgb(tensor):
    '''Ignore color and just make each pixel B/W
    Details:
        tensor: a 3D tensor (height, width, 3 RGB values)
        returns: a 2D float matrix (piano roll) with 1 where a pixel is bright and 0 elsewhere'''
    # Somewhat arbitrary threshold: R, G, and B must all be > half the max pixel strength
    half = MAX_PIXEL_STRENGTH / 2
    bright = (tensor[:, :, 0] > half) & (tensor[:, :, 1] > half) & (tensor[:, :, 2] > half)
    # Vectorized replacement of the per-pixel loop; same float 0/1 output
    return bright.astype(np.float64)
def pad_to_128(tensor, inverted=False):
    '''Add the non-piano rows back so an 88-row roll becomes 128 rows tall
    Note that piano rolls with 0-indexing look inverted by default, but are NOT inverted
    Details:
        tensor: a 2D matrix (piano roll)
        inverted: if True, this means the piano roll has been up-down inverted
        returns: the tensor with 21 zero rows before and 19 zero rows after it
                 (19 before and 21 after when inverted)'''
    width = tensor.shape[1]
    low_rows = np.zeros(shape=(21, width))
    high_rows = np.zeros(shape=(19, width))
    layers = (high_rows, tensor, low_rows) if inverted else (low_rows, tensor, high_rows)
    return np.vstack(layers)
def save_image88(fname, suffix=0, verbose=False, subdir='', rotate=True):
    '''Convert an image (88 pixels tall) to a midi file and save it
    Details:
        fname: a filename for the input image (read from OUTPUT_IMAGE_DIR + subdir + fname)
        suffix: a suffix to append to each saved filename
        verbose: if True, shows the image
        subdir: a directory name for a folder inside OUTPUT_IMAGE_DIR; also used as the
                folder (followed by '/') for the generated MIDI file
        rotate: if True, the image array is transposed before conversion
        returns: None'''
    # Close the image file as soon as its pixels have been copied into a tensor
    with Image.open(OUTPUT_IMAGE_DIR + subdir + fname) as image:
        tensor = image_to_np(image, verbose=verbose, rotate=rotate)
    # image_to_np pads the roll back to 128 rows, so the row index already is the key number.
    # The 88-key default of numpy_to_part would add the 21-key offset a second time.
    part = numpy_to_part(tensor, is88=False)
    part_to_midi(part, subdir + '/' + 'generated_midi_' + str(suffix))
def plot_history(history):
    '''Plot training and validation loss per epoch
    Details:
        history: the object returned by keras model.fit(); its .history mapping must hold
                 the 'loss' and 'val_loss' series
        returns: None'''
    for series in ('loss', 'val_loss'):
        plt.plot(history.history[series])
    plt.title('model loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
def remove_leading_zeros(arr):
    '''Drop the rows at the start of an array whose maximum is 0
    Details:
        arr: a numpy array (likely in (timestep, features))
        returns: the array from the first row whose maximum is not 0 onwards
                 (empty when every row qualifies)'''
    leading = 0
    for row in arr:
        if max(row) != 0:
            break
        leading += 1
    return arr[leading:]
def get_phrases(piano_roll, min_quarter_notes=2, max_quarter_notes=8, quarter_note_split = 0.25):
    '''Converts a piano roll into an array of phrases using the Hartono method
    Note that the phrases have NUM_KEYS + 1 features: the last one is an end-of-sequence bit
    TODO: Trash phrases that go over max and wait for a new break?
    NOTE(review): the nesting of the main loop was reconstructed from a copy of this file whose
    indentation had been lost - confirm it against the original before relying on it
    Details:
        piano_roll: a numpy array (likely in (features, timestep))
        min_quarter_notes: the minimum number of quarter notes a phrase can be
        max_quarter_notes: the maximum number of quarter notes a phrase can be
        quarter_note_split: how many quarter notes of rest before calling a new phrase
        returns: a numpy array in (phrases, SCALE * max_quarter_notes, NUM_KEYS + 1)'''
    # (timestep, features)
    p_r = np.transpose(piano_roll)
    # Count no. of times we hit cutoff (never updated below)
    cutoff_counter = 0
    # boolean to see what state (cut-off or not) we are in
    in_cutoff_state = False
    # Pad piano roll with one extra feature column (the end-of-sequence bit)
    piano_roll = np.zeros((p_r.shape[0], NUM_KEYS + 1))
    piano_roll[:p_r.shape[0], :p_r.shape[1]] = p_r
    # Amount (in timesteps) after which we look for a gap
    threshold = SCALE * min_quarter_notes
    # Number of silent timesteps that counts as enough of a rest
    gap_threshold = int(SCALE * quarter_note_split)
    # At some point you need to stop: maximum phrase length in timesteps
    cutoff = SCALE * max_quarter_notes
    # Placeholder first phrase so np.concatenate works; dropped by the final phrases[1:]
    phrases = np.zeros(shape=(1, cutoff, NUM_KEYS + 1))
    # End token: a row with only the end-of-sequence bit set
    endtoken = np.zeros((1, NUM_KEYS + 1))
    endtoken[0, NUM_KEYS] = 1
    # Iterate through timesteps
    pause_counter = 0
    # Every phrase starts with one all-zero placeholder row
    current_phrase = np.zeros((1, NUM_KEYS + 1))
    for i in tqdm(range(len(piano_roll))):
        # Get vector representing all the notes at this timestep
        timestep = piano_roll[i]
        # Get highest value (interested in if it's 0)
        maximum = max(timestep)
        # cutoff - 1 presumably leaves room for the end token row - TODO confirm
        hit_cutoff = len(current_phrase) >= cutoff - 1
        if maximum == 0 or hit_cutoff:
            pause_counter += 1
            # If gap is long enough to make a phrase and phrase itself is long enough OR hit cutoff
            long_break = (pause_counter >= gap_threshold)
            # print(long_break, hit_cutoff, in_cutoff_state)
            if long_break or hit_cutoff:
                if long_break and in_cutoff_state:
                    # A rest after a forced cut: start collecting timesteps again
                    in_cutoff_state = False
                else:
                    if len(current_phrase) >= threshold:
                        # Take out zeros at beginning
                        current_phrase = remove_leading_zeros(current_phrase)
                        # If we stopped from a long break, remove that long break
                        # NOTE(review): the slice also drops the first remaining row, and when
                        # gap_threshold == 1 (SCALE = 6, quarter_note_split = 0.25) the stop
                        # index is 0, so the slice is empty - confirm this is intended
                        current_phrase = current_phrase[1:-gap_threshold+1]
                        # Add endtoken
                        current_phrase = np.concatenate((current_phrase, endtoken), axis=0)
                        # Pad to cutoff
                        padded_current_phrase = np.zeros((cutoff, NUM_KEYS + 1))
                        padded_current_phrase[:current_phrase.shape[0], :current_phrase.shape[1]] = current_phrase
                        # Add this phrase to running phrases
                        phrases = np.concatenate((phrases, [padded_current_phrase]), axis=0)
                    if hit_cutoff:
                        # Phrase was cut at maximum length: ignore input until the next rest
                        in_cutoff_state = True
                    # Reset variables
                    current_phrase = np.zeros((1, NUM_KEYS + 1))
                    pause_counter = 0
        else:
            # Just reset the counter
            pause_counter = 0
        if not in_cutoff_state:
            # Add this timestep to running timesteps
            current_phrase = np.concatenate((current_phrase, [timestep]), axis=0)
    # print('Fraction at cutoff: ', cutoff_counter / (len(phrases)-1))
    return phrases[1:]
def get_melody(path):
    """
    Extracts the melody from a saved piano roll.
    Details:
        path: path of the numpy file holding the piano roll, shaped (keys, timesteps)
        returns: an array of the same shape containing only the melody (at most one
                 cell set to 1 per timestep)
    """
    # lowest allowed melody note, as a row index of the roll
    # NOTE(review): described as "middle C" before; that holds only if row index == MIDI key - confirm
    cutoff_number = 60
    # largest allowed downward melodic jump (one octave)
    largest_jump = 12
    # Load once and work in (timesteps, keys); the melody matches the roll's own height
    steps = np.transpose(np.load(path))
    melody = np.zeros(steps.shape)
    # Melody note chosen at the previous timestep (0 means none)
    prev_note = 0
    for index, step in enumerate(steps):
        # highest note played at this timestep (0 when nothing is played)
        active = np.flatnonzero(step == 1.0)
        high_note = active[-1] if active.size else 0
        # Keep the note when it is at or above the cutoff and is either higher than the
        # previous melody note or at most one octave below it
        if high_note >= cutoff_number and prev_note - high_note <= largest_jump:
            melody[index, high_note] = 1
            prev_note = high_note
        else:
            prev_note = 0
    return np.transpose(melody)
def extract_melody(arr, start_threshold=40, difference_threshold=12, quarter_note_reset=2):
    '''Split a piano roll into accompaniment and a single-note melody line
    NOTE(review): the nesting of the loop body was reconstructed from a copy of this file whose
    indentation had been lost - confirm it against the original
    Details:
        arr: a 2D matrix (keys, timesteps) with 1 for sounding cells; it is NOT modified
        start_threshold: a new melody may only start on a row index above this value
        difference_threshold: a note continues the melody when it is less than this many
                              rows below the previous melody note
        quarter_note_reset: number of quarter notes' worth of rejected timesteps after which
                            the previous melody note is forgotten
        returns: (accompaniment, melody), both shaped like arr; melody cells are removed
                 from the accompaniment'''
    # Transpose to (timesteps, keys). Copy explicitly: np.transpose returns a view, so
    # zeroing melody cells below would otherwise write into the caller's array
    arr = np.array(np.transpose(arr))
    melody = np.zeros(arr.shape)
    # -1 represents no (recent) last melody
    last_melody_note = -1
    # How long of no melody before resetting last_melody_note
    reset_threshold = quarter_note_reset * SCALE
    break_counter = 0
    for i, step in enumerate(arr):
        max_index = np.max(np.where(step == 1)[0]) if 1 in step else -1
        # There's at least one note played
        if max_index != -1:
            # No recent melody played and this note is above the start threshold
            start_new_melody = last_melody_note == -1 and max_index > start_threshold
            # There is a recent melody and this one is either above it or within a threshold below it
            within_threshold = last_melody_note != -1 and last_melody_note - max_index < difference_threshold
            if start_new_melody or within_threshold:
                # Move the note from the accompaniment to the melody
                melody[i][max_index] = 1
                arr[i][max_index] = 0
                last_melody_note = max_index
            # No new melody note added
            else:
                break_counter += 1
        # Reset last_melody_note (to allow lower starting melodies)
        if break_counter >= reset_threshold:
            break_counter = 0
            last_melody_note = -1
    return np.transpose(arr), np.transpose(melody)
def clean_output_numpy(arr, threshold=0.4):
    '''Binarize a 2D array in place and drop its last two columns
    Details:
        arr: a 2D numpy array; every cell is overwritten with 1 if it is > threshold, else 0
        threshold: cut-off value (strictly greater counts as 1)
        returns: a view of arr without its last two columns
                 (NOTE(review): the reason for dropping them is not visible here)'''
    # Single vectorized comparison instead of a Python-level lambda call per element
    arr[:] = arr > threshold
    return arr[:, :-2]