-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats.py
105 lines (74 loc) · 4.28 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Statistics used in ML."""
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement
# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------
import logging
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------------------------------------------------------------
# Packages
# ----------------------------------------------------------------------------------------------------------------------------
from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order
# ensure_installed("numpy scipy")
# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------
from typing import * # pylint: disable=wildcard-import,unused-wildcard-import
import numpy as np
from scipy.stats import norm, t
from scipy.stats._continuous_distns import norm_gen
from functools import lru_cache
def get_dist_percentage_span_for_sd(sd_sigma: float, dist: norm_gen = norm, **dist_kwargs) -> float:
    """Return the share of a distribution lying within ``sd_sigma`` on either side of zero.

    The result is ``1 - 2 * dist.cdf(-sd_sigma)``: the lower-tail mass is doubled and
    removed from one. That equals the two-sided coverage only when the distribution
    is symmetric around zero. With the default standard normal, ``sd_sigma`` is the
    number of standard deviations from the mean.

    Args:
        sd_sigma: Half-width of the interval, on the distribution's own scale.
        dist: Object exposing ``cdf`` (a scipy.stats distribution; standard normal by default).
        **dist_kwargs: Extra keyword arguments forwarded to ``dist.cdf`` (e.g. ``df`` for ``t``).

    Returns:
        Fraction of probability mass inside ``[-sd_sigma, sd_sigma]``.

    >>> bool(np.isclose(get_dist_percentage_span_for_sd(3), 0.9973002039367398))
    True
    >>> bool(np.isclose(get_dist_percentage_span_for_sd(3, dist=t, df=1e20), 0.9973002039367398))
    True
    """
    lower_tail_mass = dist.cdf(-sd_sigma, **dist_kwargs)
    # Both tails are taken to carry the same mass, hence the factor of two.
    return 1 - 2 * lower_tail_mass
def get_sd_for_dist_percentage(dist_percentage: float, dist: norm_gen = norm, **dist_kwargs) -> float:
    """Return the half-width around zero that encloses ``dist_percentage`` of a distribution.

    The mass left outside the interval is split equally between the two tails; the
    lower tail's mass is then mapped back through ``dist.ppf`` and the sign is flipped.
    With the default standard normal the result is expressed in standard deviations.

    Args:
        dist_percentage: Desired two-sided coverage as a fraction (e.g. ``0.95``).
        dist: Object exposing ``ppf`` (a scipy.stats distribution; standard normal by default).
        **dist_kwargs: Extra keyword arguments forwarded to ``dist.ppf`` (e.g. ``df`` for ``t``).

    Returns:
        The value ``s`` such that ``dist.cdf(-s)`` equals half of the uncovered mass.

    >>> bool(np.isclose(get_sd_for_dist_percentage(0.9973002039367398), 3.0))
    True
    """
    # Half of the uncovered mass sits in the lower tail.
    lower_tail_mass = -(dist_percentage - 1) / 2
    lower_bound = dist.ppf(lower_tail_mass, **dist_kwargs)
    return -lower_bound
@lru_cache
def get_tukey_fences_multiplier_for_quantile(
    quantile: float,
    sd_sigma: Optional[float] = 2.7,
    nonoutlying_dist_percentage: Optional[float] = None,
    dist: norm_gen = norm,
    **dist_kwargs,
) -> float:
    """Compute the Tukey fences multiplier ``k`` for a given quantile and target coverage.

    Tukey's test [https://en.wikipedia.org/wiki/John_Tukey] flags values outside
    ``[Q_low - k * IQR, Q_high + k * IQR]``; he proposed k=1.5 for an "outlier" and
    k=3 for data that is "far out".
    Reasoning: https://math.stackexchange.com/questions/966331/why-john-tukey-set-1-5-iqr-to-detect-outliers-instead-of-1-or-2/

    For a distribution symmetric around zero the fence sits at ``ppf + k * 2 * ppf``
    with ``ppf = |dist.ppf(quantile)|``; solving for the fence to land at ``sd_sigma``
    gives ``k = (sd_sigma - ppf) / (2 * ppf)``.

    Results are memoised with ``lru_cache``, so every argument must be hashable.

    Args:
        quantile: Lower (or upper) quantile defining the inter-quantile range, strictly
            between 0 and 1. Values above 0.5 are mirrored to ``1 - quantile``.
        sd_sigma: Distance from zero at which the fence should sit. Pass ``None`` to
            derive it from ``nonoutlying_dist_percentage`` instead.
        nonoutlying_dist_percentage: Fraction of the distribution that must fall inside
            the fences; only used when ``sd_sigma`` is ``None``.
        dist: Object exposing ``ppf`` (a scipy.stats distribution; standard normal by default).
        **dist_kwargs: Extra keyword arguments forwarded to ``dist.ppf``.

    Returns:
        The multiplier ``k``.

    Raises:
        AssertionError: If ``quantile`` is outside (0, 1), or ``sd_sigma`` is ``None`` and
            ``nonoutlying_dist_percentage`` is missing or outside (0, 1).

    >>> bool(np.isclose(get_tukey_fences_multiplier_for_quantile(quantile=0.25, sd_sigma=2.7), 1.5015129949825627))
    True
    >>> bool(np.isclose(get_tukey_fences_multiplier_for_quantile(quantile=0.25, sd_sigma=None, nonoutlying_dist_percentage=0.9930660523939187), 1.5015129949825627))
    True
    >>> bool(np.isclose(get_tukey_fences_multiplier_for_quantile(quantile=0.1, sd_sigma=2.7), 0.5534105971977119))
    True
    """
    assert 0 < quantile < 1.0, "quantile must lie strictly between 0 and 1"

    # Work with the lower quantile; the upper one is its mirror image.
    if quantile > 0.5:
        quantile = 1 - quantile

    if sd_sigma is None:
        assert (
            nonoutlying_dist_percentage is not None and 0 < nonoutlying_dist_percentage < 1.0
        ), "nonoutlying_dist_percentage must lie strictly between 0 and 1 when sd_sigma is None"
        # Unpack the distribution parameters: passing them as a single ``dist_kwargs=``
        # keyword would end up as an unknown argument to ``dist.ppf``.
        sd_sigma = get_sd_for_dist_percentage(nonoutlying_dist_percentage, dist=dist, **dist_kwargs)

    ppf = np.abs(dist.ppf(quantile, **dist_kwargs))
    return (sd_sigma - ppf) / (2 * ppf)
def get_expected_unique_random_numbers_qty(span_size: int, sample_size: int) -> float:
    """Estimate how many distinct values appear when sampling uniformly with replacement.

    Uses ``span_size * (1 - exp(-sample_size / span_size))`` and rounds the result up
    with ``np.ceil``.
    https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement

    Args:
        span_size: Number of distinct values that can be drawn.
        sample_size: Number of draws.

    Returns:
        The estimate, rounded up to a whole number (returned as a float).

    >>> float(get_expected_unique_random_numbers_qty(span_size=2000, sample_size=10))
    10.0
    >>> float(get_expected_unique_random_numbers_qty(span_size=200, sample_size=100))
    79.0

    A numerical sanity check by simulation (the ratio printed was 0.78861 in one run,
    i.e. about 78.9 unique values per 100 draws)::

        a, b = 0, 0
        for _ in range(1000):
            values = np.random.randint(0, 200, size=100)
            a += len(values)
            b += len(np.unique(values))
        print(b / a)
    """
    never_drawn_share = np.exp(-sample_size / span_size)
    expected_unique = span_size * (1 - never_drawn_share)
    return np.ceil(expected_unique)