From 57f7c31b120f6135c31207295372e3b67848126d Mon Sep 17 00:00:00 2001 From: Leigh Smith Date: Wed, 13 Mar 2024 10:09:23 -0400 Subject: [PATCH] Optimise & fix sonification (#355) * Added test for only sonifying those frequency bands which have energy * Moved the summation outside of the frequency loop to hopefully benefit from vector hardware acceleration * Corrected t_max parameter to match units when adjusting intervals, corrected off-by-one comparison of time centers vs. times, further optimised computations in time_frequency * Disabled diagnostics * Removed disabled diagnostics, added clarification to doc strings of the units expected for frequencies and times, reverted version to the already released version, per request * Added suggested optimisations to times and n_times and made truncation of gram when interpolating more explicit and hence meaningful & robust * Simplified to a single call to the interpolator * Removed diagnostics * Changed synthesis optimization threshold to be a function parameter specifying magnitude average, rather than sum. Added rounding of the sample interval computation to avoid precision loss and truncation leading to an off-by-one mismatch between gram and sample intervals. * Renamed variable to match changed metric * Corrected stupid typo * Changed to check that sonification occurs if one element in the frequency band exceeds the threshold --- mir_eval/sonify.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/mir_eval/sonify.py b/mir_eval/sonify.py index 0e5d3da8..13a2ea3a 100644 --- a/mir_eval/sonify.py +++ b/mir_eval/sonify.py @@ -61,7 +61,7 @@ def clicks(times, fs, click=None, length=None): def time_frequency(gram, frequencies, times, fs, function=np.sin, length=None, - n_dec=1): + n_dec=1, threshold=0.01): """Reverse synthesis of a time-frequency representation of a signal Parameters @@ -73,11 +73,11 @@ def time_frequency(gram, frequencies, times, fs, function=np.sin, length=None, Non-positive magnitudes are interpreted as silence. frequencies : np.ndarray - array of size ``gram.shape[0]`` denoting the frequency of + array of size ``gram.shape[0]`` denoting the frequency (in Hz) of each row of gram times : np.ndarray, shape= ``(gram.shape[1],)`` or ``(gram.shape[1], 2)`` - Either the start time of each column in the gram, - or the time interval corresponding to each column. + Either the start time (in seconds) of each column in the gram, + or the time interval (in seconds) corresponding to each column. fs : int desired sampling rate of the output signal function : function @@ -88,6 +88,9 @@ def time_frequency(gram, frequencies, times, fs, function=np.sin, length=None, n_dec : int the number of decimals used to approximate each sonfied frequency. Defaults to 1 decimal place. Higher precision will be slower. + threshold : float + optimizes synthesis to only occur for frequencies that have a + linear magnitude of at least one element in gram above the given threshold. Returns ------- @@ -103,11 +106,17 @@ def time_frequency(gram, frequencies, times, fs, function=np.sin, length=None, if length is None: length = int(times[-1, 1] * fs) - times, _ = util.adjust_intervals(times, t_max=length) + last_time_in_secs = float(length) / fs + times, _ = util.adjust_intervals(times, t_max=last_time_in_secs) - # Truncate times so that the shape matches gram - n_times = gram.shape[1] + # Truncate times so that the shape matches gram. However if the time boundaries were converted + # to intervals, then the number of times will be reduced by one, so we only truncate + # if the gram is smaller. + n_times = min(gram.shape[1], times.shape[0]) times = times[:n_times] + # Round up to ensure that the adjusted interval last time does not diverge from length + # due to a loss of precision and truncation to ints. + sample_intervals = np.round(times * fs).astype(int) def _fast_synthesize(frequency): """A faster way to synthesize a signal. @@ -154,27 +163,32 @@ def __interpolator(x): output = np.zeros(length) time_centers = np.mean(times, axis=1) * float(fs) + # Check if there is at least one element on each frequency that has a value above the threshold + # to justify processing, for optimisation. + spectral_max_magnitudes = np.max(gram, axis = 1) for n, frequency in enumerate(frequencies): + if spectral_max_magnitudes[n] < threshold: + continue # Get a waveform of length samples at this frequency wave = _fast_synthesize(frequency) - # Interpolate the values in gram over the time grid + # Interpolate the values in gram over the time grid. if len(time_centers) > 1: + # If times was converted from boundaries to intervals, it will change shape from + # (len, 1) to (len-1, 2), and hence differ from the length of gram (i.e one less), + # so we ensure gram is reduced appropriately. gram_interpolator = interp1d( - time_centers, gram[n, :], + time_centers, gram[n, :n_times], kind='linear', bounds_error=False, fill_value=(gram[n, 0], gram[n, -1])) # If only one time point, create constant interpolator else: gram_interpolator = _const_interpolator(gram[n, 0]) - # Scale each time interval by the piano roll magnitude - for m, (start, end) in enumerate((times * fs).astype(int)): - # Clip the timings to make sure the indices are valid - start, end = max(start, 0), min(end, length) - # add to waveform - output[start:end] += ( - wave[start:end] * gram_interpolator(np.arange(start, end))) + # Create the time-varying scaling for the entire time interval by the piano roll + # magnitude and add to the accumulating waveform. + output += wave[:length] * gram_interpolator(np.arange(max(sample_intervals[0][0], 0), + min(sample_intervals[-1][-1], length))) # Normalize, but only if there's non-zero values norm = np.abs(output).max()