-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpeakPicker.m
84 lines (73 loc) · 3.11 KB
/
peakPicker.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
function [pks_p,pks_w,absDft_f,smoothX_p,pVals] ...
= peakPicker(x_t, Fs, sigma, thresh, fRef, doPlot)
%PEAKPICKER Convert time-domain signal to log-f spectrum, smooth, find peaks.
%
% x_t is the time domain audio audio signal
%
% Fs is its sample rate (these can be got from an audio file by [x_t,Fs] =
% audioread('audiofile.wav');])
%
% sigma is the smoothing width (standard deviation of the pitch-domain Gaussian
% kernel)
%
% fRef is the frequency assigned a value of 0 cents
%
% pks_p are the pitches of peaks (found with findpeaks) of the DFT of x_t after
% it has been translated into the pitch (log-f) domain and smoothed with a
% Gaussian kernel with standard deviation sigma
%
% pks_w are the magnitudes of the peaks with pitches pks_p
%
% absDft_f is the magnitude of the DFT of x_t (frequency domain, unsmoothed).
%
% smoothX_p is the absolute expectation vector (with sigma-width smoothing)
%
% pVals give the pitch values (cents relative to fRef) of xSmooth_p
% By Andrew J. Milne, The MARCS Institute, Western Sydney University.
%% Take the DFT of the input wav
x_t = x_t(:);
nSamples = numel(x_t);
x_f = fft(x_t)/nSamples;
%% Use "sparse sound structure" (SSS) to convert DFT to pitch (log-f) domain
% X = dft2sss(peaks,Fs,{'MP','ESP','CBR','ERBR'},'FRef',fRef);
X = dft2sss(x_f,Fs,'MP','FRef',fRef);
%% Convert to absolute pitch expectation vector
x_p = X.MP; % the pitches (cents relative to fRef) of the entries in x_w
x_w = abs(X.Phasors); % the magnitudes of the pitches in x_p
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
absDft_f = x_w;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Remove almost-zero values below thresh. A sensible thresh value is about
% 60dB below the signal's max because these are likely to be perceptually
% masked (inaudible) and are possible sectral leakage artefacts
x_p(x_w < thresh) = [];
x_w(x_w < thresh) = [];
r = 1;
isRel = 0;
isPer = 0;
limits = 1200*log2([20/fRef 20000/fRef]); % the audible frequency range
isSparse = 0;
kerLen = 12;
% These two lines are not necessary (because out-of-limits values are
% automatically removed by expectationTensor) but removing them here means
% warnings are not displayed.
x_w(x_p < limits(1) | x_p > limits(2)) = []; % remove inaudible frequencies
x_p(x_p < limits(1) | x_p > limits(2)) = []; % remove inaudible frequencies
[smoothX_p,pVals] = expectationTensor(x_p, x_w, sigma, kerLen, ...
r, isRel, isPer, limits, ...
isSparse, 0);
%% Extract the peaks
if doPlot==1
figure()
findpeaks(smoothX_p, pVals, ...
'MinPeakDistance', 0, ...
'MinPeakProminence', 0.002, ...
'Annotate', 'Extents')
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[pks_w, pks_p] = findpeaks(smoothX_p, pVals, ...
'MinPeakDistance', 0, ...
'MinPeakProminence', 0.002, ...
'Annotate', 'Extents');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end