-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheffsize.py
389 lines (292 loc) · 11.1 KB
/
effsize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#!/usr/bin/python
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : [email protected]
"""
A range of functions to compute various effect sizes.
two_group_difference
cohens_d
hedges_g
cliffs_delta
func_difference
"""
# Obtained from https://github.com/ACCLAB/DABEST-python/blob/master/dabest/_stats_tools/effsize.py
def two_group_difference(control, test, is_paired=False,
effect_size="mean_diff"):
"""
Computes the following metrics for control and test:
- Unstandardized mean difference
- Standardized mean differences (paired or unpaired)
* Cohen's d
* Hedges' g
- Median difference
- Cliff's Delta
See the Wikipedia entry here: https://bit.ly/2LzWokf
Parameters
----------
control, test: list, tuple, or ndarray.
Accepts lists, tuples, or numpy ndarrays of numeric types.
is_paired: boolean, default False.
If True, returns the paired Cohen's d.
effect_size: string, default "mean_diff"
Any one of the following effect sizes:
["mean_diff", "median_diff", "cohens_d", "hedges_g", "cliffs_delta"]
mean_diff: This is simply the mean of `control` subtracted from
the mean of `test`.
cohens_d: This is the mean of control subtracted from the
mean of test, divided by the pooled standard deviation
of control and test. The pooled SD is the square as:
(n1 - 1) * var(control) + (n2 - 1) * var(test)
sqrt ( ------------------------------------------- )
(n1 + n2 - 2)
where n1 and n2 are the sizes of control and test
respectively.
hedges_g: This is Cohen's d corrected for bias via multiplication
with the following correction factor:
gamma(n/2)
J(n) = ------------------------------
sqrt(n/2) * gamma((n - 1) / 2)
where n = (n1 + n2 - 2).
median_diff: This is the median of `control` subtracted from the
median of `test`.
Returns
-------
float: The desired effect size.
"""
import numpy as np
if effect_size == "mean_diff":
return func_difference(control, test, np.mean, is_paired)
elif effect_size == "median_diff":
return func_difference(control, test, np.median, is_paired)
elif effect_size == "cohens_d":
return cohens_d(control, test, is_paired)
elif effect_size == "hedges_g":
return hedges_g(control, test, is_paired)
elif effect_size == "cliffs_delta":
if is_paired is True:
err1 = "`is_paired` is True; therefore Cliff's delta is not defined."
raise ValueError(err1)
else:
return cliffs_delta(control, test)
def func_difference(control, test, func, is_paired):
"""
Applies func to `control` and `test`, and then returns the difference.
Keywords:
--------
control, test: List, tuple, or array.
NaNs are automatically discarded.
func: summary function to apply.
is_paired: boolean.
If True, computes func(test - control).
If False, computes func(test) - func(control).
Returns:
--------
diff: float.
"""
import numpy as np
# Convert to numpy arrays for speed.
# NaNs are automatically dropped.
if control.__class__ != np.ndarray:
control = np.array(control)
if test.__class__ != np.ndarray:
test = np.array(test)
if is_paired:
if len(control) != len(test):
err = "The two arrays supplied do not have the same length."
raise ValueError(err)
control_nan = np.where(np.isnan(control))[0]
test_nan = np.where(np.isnan(test))[0]
indexes_to_drop = np.unique(np.concatenate([control_nan,
test_nan]))
good_indexes = [i for i in range(0, len(control))
if i not in indexes_to_drop]
control = control[good_indexes]
test = test[good_indexes]
return func(test - control)
else:
control = control[~np.isnan(control)]
test = test[~np.isnan(test)]
return func(test) - func(control)
def cohens_d(control, test, is_paired=False):
"""
Computes Cohen's d for test v.s. control.
See https://en.wikipedia.org/wiki/Effect_size#Cohen's_d
Keywords
--------
control, test: List, tuple, or array.
is_paired: boolean, default False
If True, the paired Cohen's d is returned.
Returns
-------
d: float.
If is_paired is False, this is equivalent to:
(numpy.mean(test) - numpy.mean(control)) / pooled StDev
If is_paired is True, returns
(numpy.mean(test) - numpy.mean(control)) / average StDev
The pooled standard deviation is equal to:
(n1 - 1) * var(control) + (n2 - 1) * var (test)
sqrt( ---------------------------------------------- )
(n1 + n2 - 2)
The average standard deviation is equal to:
var(control) + var(test)
sqrt( ------------------------- )
2
Notes
-----
The sample variance (and standard deviation) uses N-1 degrees of freedoms.
This is an application of Bessel's correction, and yields the unbiased
sample variance.
References:
https://en.wikipedia.org/wiki/Bessel%27s_correction
https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation
"""
import numpy as np
# Convert to numpy arrays for speed.
# NaNs are automatically dropped.
if control.__class__ != np.ndarray:
control = np.array(control)
if test.__class__ != np.ndarray:
test = np.array(test)
control = control[~np.isnan(control)]
test = test[~np.isnan(test)]
pooled_sd, average_sd = _compute_standardizers(control, test)
# pooled SD is used for Cohen's d of two independant groups.
# average SD is used for Cohen's d of two paired groups
# (aka repeated measures).
# NOT IMPLEMENTED YET: Correlation adjusted SD is used for Cohen's d of
# two paired groups but accounting for the correlation between
# the two groups.
if is_paired:
# Check control and test are same length.
if len(control) != len(test):
raise ValueError("`control` and `test` are not the same length.")
# assume the two arrays are ordered already.
delta = test - control
M = np.mean(delta)
divisor = average_sd
else:
M = np.mean(test) - np.mean(control)
divisor = pooled_sd
return M / divisor
def hedges_g(control, test, is_paired=False):
"""
Computes Hedges' g for for test v.s. control.
It first computes Cohen's d, then calulates a correction factor based on
the total degress of freedom using the gamma function.
See https://en.wikipedia.org/wiki/Effect_size#Hedges'_g
Keywords
--------
control, test: numeric iterables.
These can be lists, tuples, or arrays of numeric types.
Returns
-------
g: float.
"""
import numpy as np
# Convert to numpy arrays for speed.
# NaNs are automatically dropped.
if control.__class__ != np.ndarray:
control = np.array(control)
if test.__class__ != np.ndarray:
test = np.array(test)
control = control[~np.isnan(control)]
test = test[~np.isnan(test)]
d = cohens_d(control, test, is_paired)
len_c = len(control)
len_t = len(test)
correction_factor = _compute_hedges_correction_factor(len_c, len_t)
return correction_factor * d
def cliffs_delta(control, test):
"""
Computes Cliff's delta for 2 samples.
See https://en.wikipedia.org/wiki/Effect_size#Effect_size_for_ordinal_data
Keywords
--------
control, test: numeric iterables.
These can be lists, tuples, or arrays of numeric types.
Returns
-------
A single numeric float.
"""
import numpy as np
from scipy.stats import mannwhitneyu
# Convert to numpy arrays for speed.
# NaNs are automatically dropped.
if control.__class__ != np.ndarray:
control = np.array(control)
if test.__class__ != np.ndarray:
test = np.array(test)
c = control[~np.isnan(control)]
t = test[~np.isnan(test)]
control_n = len(c)
test_n = len(t)
# Note the order of the control and test arrays.
U, _ = mannwhitneyu(t, c, alternative='two-sided')
cliffs_delta = ((2 * U) / (control_n * test_n)) - 1
# more = 0
# less = 0
#
# for i, c in enumerate(control):
# for j, t in enumerate(test):
# if t > c:
# more += 1
# elif t < c:
# less += 1
#
# cliffs_delta = (more - less) / (control_n * test_n)
return cliffs_delta
def _compute_standardizers(control, test):
from numpy import mean, var, sqrt, nan
# For calculation of correlation; not currently used.
# from scipy.stats import pearsonr
control_n = len(control)
test_n = len(test)
control_mean = mean(control)
test_mean = mean(test)
control_var = var(control, ddof=1) # use N-1 to compute the variance.
test_var = var(test, ddof=1)
control_std = sqrt(control_var)
test_std = sqrt(test_var)
# For unpaired 2-groups standardized mean difference.
pooled = sqrt(((control_n - 1) * control_var + (test_n - 1) * test_var) /
(control_n + test_n - 2)
)
# For paired standardized mean difference.
average = sqrt((control_var + test_var) / 2)
# if len(control) == len(test):
# corr = pearsonr(control, test)[0]
# std_diff = sqrt(control_var + test_var - (2 * corr * control_std * test_std))
# std_diff_corrected = std_diff / (sqrt(2 * (1 - corr)))
# return pooled, average, std_diff_corrected
#
# else:
return pooled, average # indent if you implement above code chunk.
def _compute_hedges_correction_factor(n1, n2):
"""
Computes the bias correction factor for Hedges' g.
See https://en.wikipedia.org/wiki/Effect_size#Hedges'_g
Returns
-------
j: float
References
----------
Larry V. Hedges & Ingram Olkin (1985).
Statistical Methods for Meta-Analysis. Orlando: Academic Press.
ISBN 0-12-336380-2.
"""
from scipy.special import gamma
from numpy import sqrt, isinf
import warnings
df = n1 + n2 - 2
numer = gamma(df / 2)
denom0 = gamma((df - 1) / 2)
denom = sqrt(df / 2) * denom0
if isinf(numer) or isinf(denom):
# occurs when df is too large.
# Apply Hedges and Olkin's approximation.
df_sum = n1 + n2
denom = (4 * df_sum) - 9
out = 1 - (3 / denom)
else:
out = numer / denom
return out