stat_util.py
import numpy as np
from scipy.stats import percentileofscore


def score_ci(
    y_true,
    y_pred,
    score_fun,
    sample_weight=None,
    n_bootstraps=2000,
    confidence_level=0.95,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute confidence interval for given score function based on labels and predictions using bootstrapping.

    :param y_true: 1D list or array of labels.
    :param y_pred: 1D list or array of predictions corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param sample_weight: 1D list or array of sample weights to pass to score_fun, see e.g. sklearn.metrics.roc_auc_score.
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param confidence_level: Confidence level for computing confidence interval. (default: 0.95)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
        need at least one positive and one negative sample. (default: True)
    :return: Score evaluated on labels and predictions, lower confidence interval, upper confidence interval, array of
        bootstrapped scores.
    """
    assert len(y_true) == len(y_pred)

    score = score_fun(y_true, y_pred)
    _, ci_lower, ci_upper, scores = score_stat_ci(
        y_true=y_true,
        y_preds=y_pred,
        score_fun=score_fun,
        sample_weight=sample_weight,
        n_bootstraps=n_bootstraps,
        confidence_level=confidence_level,
        seed=seed,
        reject_one_class_samples=reject_one_class_samples,
    )

    return score, ci_lower, ci_upper, scores

def score_stat_ci(
    y_true,
    y_preds,
    score_fun,
    stat_fun=np.mean,
    sample_weight=None,
    n_bootstraps=2000,
    confidence_level=0.95,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute confidence interval for given statistic of a score function based on labels and predictions using
    bootstrapping.

    :param y_true: 1D list or array of labels.
    :param y_preds: A list of lists or 2D array of predictions corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param stat_fun: Statistic for which confidence interval is computed. (e.g. np.mean)
    :param sample_weight: 1D list or array of sample weights to pass to score_fun, see e.g. sklearn.metrics.roc_auc_score.
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param confidence_level: Confidence level for computing confidence interval. (default: 0.95)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
        need at least one positive and one negative sample. (default: True)
    :return: Mean score statistic evaluated on labels and predictions, lower confidence interval, upper confidence
        interval, array of bootstrapped scores.
    """
    y_true = np.array(y_true)
    y_preds = np.atleast_2d(y_preds)
    assert all(len(y_true) == len(y) for y in y_preds)
    if sample_weight is not None:
        # Convert to an array so it can be indexed with the bootstrap indices below.
        sample_weight = np.array(sample_weight)

    np.random.seed(seed)
    scores = []
    for i in range(n_bootstraps):
        readers = np.random.randint(0, len(y_preds), len(y_preds))
        indices = np.random.randint(0, len(y_true), len(y_true))
        if reject_one_class_samples and len(np.unique(y_true[indices])) < 2:
            continue
        reader_scores = []
        for r in readers:
            if sample_weight is not None:
                reader_scores.append(score_fun(y_true[indices], y_preds[r][indices], sample_weight=sample_weight[indices]))
            else:
                reader_scores.append(score_fun(y_true[indices], y_preds[r][indices]))
        scores.append(stat_fun(reader_scores))

    mean_score = np.mean(scores)
    sorted_scores = np.array(sorted(scores))
    alpha = (1.0 - confidence_level) / 2.0
    ci_lower = sorted_scores[int(round(alpha * len(sorted_scores)))]
    ci_upper = sorted_scores[int(round((1.0 - alpha) * len(sorted_scores)))]

    return mean_score, ci_lower, ci_upper, scores

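# Usage sketch, not part of the original module: a bootstrap 95% CI for ROC AUC of a
# single model via score_ci, and for the mean AUC of two readers via score_stat_ci.
# The labels, prediction vectors, and the choice of sklearn.metrics.roc_auc_score as
# score_fun are illustrative assumptions, not values from the original code.
if __name__ == "__main__":
    from sklearn.metrics import roc_auc_score

    y = [0, 1, 0, 1, 1, 0, 1, 0, 1, 0]
    p1 = [0.1, 0.9, 0.4, 0.7, 0.8, 0.3, 0.6, 0.2, 0.9, 0.5]
    p2 = [0.2, 0.8, 0.3, 0.8, 0.7, 0.4, 0.7, 0.1, 0.8, 0.4]

    # Single model: point estimate plus bootstrap confidence interval.
    auc, lo, hi, _ = score_ci(y, p1, score_fun=roc_auc_score, seed=42)
    print("AUC {:.3f}, 95% CI ({:.3f}, {:.3f})".format(auc, lo, hi))

    # Two readers: confidence interval for the mean AUC across readers.
    mean_auc, lo, hi, _ = score_stat_ci(y, [p1, p2], score_fun=roc_auc_score, stat_fun=np.mean, seed=42)
    print("Mean AUC {:.3f}, 95% CI ({:.3f}, {:.3f})".format(mean_auc, lo, hi))
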
def pvalue(
    y_true,
    y_pred1,
    y_pred2,
    score_fun,
    sample_weight=None,
    n_bootstraps=2000,
    two_tailed=True,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute p-value for hypothesis that score function for model I predictions is higher than for model II predictions
    using bootstrapping.

    :param y_true: 1D list or array of labels.
    :param y_pred1: 1D list or array of predictions for model I corresponding to elements in y_true.
    :param y_pred2: 1D list or array of predictions for model II corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param sample_weight: 1D list or array of sample weights to pass to score_fun, see e.g. sklearn.metrics.roc_auc_score.
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param two_tailed: Whether to use two-tailed test. (default: True)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
        need at least one positive and one negative sample. (default: True)
    :return: Computed p-value, array of bootstrapped differences of scores.
    """
    assert len(y_true) == len(y_pred1)
    assert len(y_true) == len(y_pred2)

    return pvalue_stat(
        y_true=y_true,
        y_preds1=y_pred1,
        y_preds2=y_pred2,
        score_fun=score_fun,
        sample_weight=sample_weight,
        n_bootstraps=n_bootstraps,
        two_tailed=two_tailed,
        seed=seed,
        reject_one_class_samples=reject_one_class_samples,
    )

def pvalue_stat(
    y_true,
    y_preds1,
    y_preds2,
    score_fun,
    stat_fun=np.mean,
    compare_fun=np.subtract,
    sample_weight=None,
    n_bootstraps=2000,
    two_tailed=True,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute p-value for hypothesis that given statistic of score function for model I predictions is higher than for
    model II predictions using bootstrapping.

    :param y_true: 1D list or array of labels.
    :param y_preds1: A list of lists or 2D array of predictions for model I corresponding to elements in y_true.
    :param y_preds2: A list of lists or 2D array of predictions for model II corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param stat_fun: Statistic for which p-value is computed. (e.g. np.mean)
    :param compare_fun: Function to determine relative performance. (default: score1 - score2)
    :param sample_weight: 1D list or array of sample weights to pass to score_fun, see e.g. sklearn.metrics.roc_auc_score.
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param two_tailed: Whether to use two-tailed test. (default: True)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
        need at least one positive and one negative sample. (default: True)
    :return: Computed p-value, array of bootstrapped differences of scores.
    """
    y_true = np.array(y_true)
    y_preds1 = np.atleast_2d(y_preds1)
    y_preds2 = np.atleast_2d(y_preds2)
    assert all(len(y_true) == len(y) for y in y_preds1)
    assert all(len(y_true) == len(y) for y in y_preds2)
    if sample_weight is not None:
        # Convert to an array so it can be indexed with the bootstrap indices below.
        sample_weight = np.array(sample_weight)

    np.random.seed(seed)
    z = []
    for i in range(n_bootstraps):
        readers1 = np.random.randint(0, len(y_preds1), len(y_preds1))
        readers2 = np.random.randint(0, len(y_preds2), len(y_preds2))
        indices = np.random.randint(0, len(y_true), len(y_true))
        if reject_one_class_samples and len(np.unique(y_true[indices])) < 2:
            continue
        reader1_scores = []
        for r in readers1:
            if sample_weight is not None:
                reader1_scores.append(score_fun(y_true[indices], y_preds1[r][indices], sample_weight=sample_weight[indices]))
            else:
                reader1_scores.append(score_fun(y_true[indices], y_preds1[r][indices]))
        score1 = stat_fun(reader1_scores)
        reader2_scores = []
        for r in readers2:
            if sample_weight is not None:
                reader2_scores.append(score_fun(y_true[indices], y_preds2[r][indices], sample_weight=sample_weight[indices]))
            else:
                reader2_scores.append(score_fun(y_true[indices], y_preds2[r][indices]))
        score2 = stat_fun(reader2_scores)
        z.append(compare_fun(score1, score2))

    p = percentileofscore(z, 0.0, kind="weak") / 100.0
    if two_tailed:
        p *= 2.0

    return p, z

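# Usage sketch, not part of the original module: a bootstrap p-value for the hypothesis
# that model I has a higher ROC AUC than model II, via pvalue, and the multi-reader
# variant via pvalue_stat. The data and the choice of sklearn.metrics.roc_auc_score as
# score_fun are illustrative assumptions, not values from the original code.
if __name__ == "__main__":
    from sklearn.metrics import roc_auc_score

    y = [0, 1, 0, 1, 1, 0, 1, 0, 1, 0]
    model1 = [0.1, 0.9, 0.4, 0.7, 0.8, 0.3, 0.6, 0.2, 0.9, 0.5]
    model2 = [0.6, 0.7, 0.5, 0.5, 0.6, 0.4, 0.5, 0.6, 0.7, 0.5]

    # Two single models compared on the same cases.
    p, _ = pvalue(y, model1, model2, score_fun=roc_auc_score, seed=42)
    print("p-value, model I vs. model II: {:.4f}".format(p))

    # Groups of readers for each model, compared via the mean AUC of each group.
    group1 = [model1, [0.2, 0.8, 0.3, 0.6, 0.9, 0.2, 0.7, 0.3, 0.8, 0.4]]
    group2 = [model2, [0.5, 0.6, 0.5, 0.6, 0.5, 0.5, 0.6, 0.5, 0.6, 0.5]]
    p, _ = pvalue_stat(y, group1, group2, score_fun=roc_auc_score, stat_fun=np.mean, seed=42)
    print("p-value, reader group I vs. group II: {:.4f}".format(p))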