-
Notifications
You must be signed in to change notification settings - Fork 3
/
bin2stats.m
131 lines (110 loc) · 3.84 KB
/
bin2stats.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
function varargout=bin2stats(x,y,multp,perc)
% [xi,ypi,x,y,mult,perc]=BIN2STATS(x,y,multp,perc)
%
% A binned-percentile calculator
%
% INPUT:
%
% x The independent variable, not necessarily unique or equally spaced
% y The same-size dependent variable
% multp The median-x-sampling interval multiplier, the interval over which the
% desired statistics are quoted [default: a fraction of the x-range]
% perc The percentiles [default: 5 50 95]
%
% OUTPUT:
%
% xi New independent variables, midpoints of the requested intervals
% ypi A matrix with columns according to the requested percentiles
% x,y Regurgitated inputs
% mult,perc Regurgitated inputs
%
% EXAMPLE:
%
% bin2stats('demo1')
% bin2stats('demo1',N) % for N a certain integer which randomizes the sample size
%
% SEE ALSO:
%
% ROW2STATS, BINDENS
%
% Last modified by fjsimons-at-alum.mit.edu, 11/10/2023
% Supply some defaults for unit testing
defval('x',rand(randi(10000),1))
% Get to it, boss!
if ~isstr(x)
defval('y',randn(length(x),1));
% Supply some more defaults
defval('multp',round(range(x(:))/30/median(diff(unique(x)))));
defval('perc',[5 50 95]);
% Make sure the input arrays are one-dimensional
x=x(:);
y=y(:);
% First we handle duplicate data points
[xu,IA]=unique(x);
if length(xu)~=length(x)
% Rejig a bit. In BENDERX I used ROW2STATS
[xu,IA]=unique(x+(-1)^randi(2)*rand(length(x),1)*100000*eps);
disp('Have had to rejig')
end
if length(xu)~=length(x)
error('Still not unique')
end
% Reassign data
y=y(IA);
x=xu;
% Then we go MAG2MED, GPS2MEDIAN
beg=min(x);
% Figure out the median x-sampling intervals
newdt=median(diff(x));
intv=multp*newdt;
% How many times will this interval - potentially - be repeated?
ntms=ceil([max(x)-beg]/intv);
% Initialize the medians vector
ypi=nan(ntms,length(perc(:)));
% Snap every value to the nearest increment of newdt
newt=round(x/newdt)*newdt;
% Interpolate the data to the median sampling intervals - explore
% methods! Nearest is most appropriate when noisy data are taken
% sometimes very closest together
yi=interp1(x,y,newt,'nearest');
% This is the average number of values that you'll want to take the
% statistics over, which will end up filling an unequally spaced numbers
% of bins through the reshaped data matrix that you are about to
% construct; watch out for strongly nonhomogeneously distributed data
% sets where you'll want to play with the multp parameter much
multc=floor(length(yi)/multp);
% Compute the desired stats in one go, over the same number of elements
ypi=prctile(reshape(yi(1:multp*multc),multp,multc),perc)';
% But... there's a couple you might have missed, so add their stats also
ypi=[ypi ; prctile(yi(multp*multc+1:end),perc)];
% From this you can learn at which time "ypi" should be quoted
xi=newt([round(multp/2):multp:multp*multc ...
multp*multc+round([length(yi)-multp*multc]/2)])';
% Optional output
varns={xi,ypi,x,y,multp,perc};
varargout=varns(1:nargout);
elseif strcmp(x,'demo1')
% Now the second input is the number,for the demo only
defval('y',[]); N=y; defval('N',10000)
% Some random-size sample of uniform random independent variables
x=rand(randi(N),1);
% Some Gaussian random dependent variables
y=randn(length(x),1);
% Go through the motions
[xi,ypi,x,y,m,p]=bin2stats(x,y);
plot(x,y,'b.')
hold on
pp(1)=plot(xi,ypi(:,2),'k','LineWidth',1);
pp(2)=plot(xi,ypi(:,1),'r','LineWidth',1.5);
pp(3)=plot(xi,ypi(:,3),'r','LineWidth',1.5);
hold off
grid on
longticks(gca,2)
title(sprintf('sample size %i',length(x)))
for index=1:length(p)
pl{index}=sprintf('%ith percentile',p(index));
end
legend(pp,pl)
xlabel('independent variable')
ylabel('dependent variable')
end