-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
70 changed files
with
226 additions
and
85 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
|
||
import scipy | ||
from scipy import stats | ||
from scipy.stats import norm | ||
import numpy as np, pandas as pd | ||
|
||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Core | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
|
||
def I(cond: np.ndarray) -> np.ndarray:
    """Indicator function: turn a boolean condition into integers.

    Elements where ``cond`` is true become 1, all other elements become 0.
    The conversion is delegated to ``cond.astype(int)``, so the container
    type of the result is whatever ``astype`` returns for the input.
    """
    as_integers = cond.astype(int)
    return as_integers
|
||
|
||
def get_sapp_dataset(
    loc: float = 0.0,
    scale: float = 9.0,
    distr_name: str = "norm",
    distr_params: tuple = (),
    N: int = 1000,
    add_error: bool = False,
    random_state: int = 42,
    dtype=np.float32,
    binarize: bool = True,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Generate a synthetic 20-feature dataset with a non-linear target.

    Used in work
    Subsemble: an ensemble method for combining subset-specific algorithm fits
    Stephanie Sapp, Mark J. van der Laan, and John Canny
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4000126/pdf/nihms-539092.pdf

    Args:
        loc: First positional argument forwarded to the distribution's ``rvs``.
        scale: Second positional argument forwarded to ``rvs``.
        distr_name: Name of a distribution object in ``scipy.stats``.
        distr_params: Extra positional arguments appended after ``loc`` and
            ``scale`` in the ``rvs`` call.
        N: Number of rows to generate.
        add_error: If true, add one more draw from the same distribution to
            the target as noise.
        random_state: Seed for the random generator used for all draws.
        dtype: dtype both returned objects are cast to.
        binarize: If true, replace the target with 1 where it exceeds its own
            mean and 0 elsewhere.

    Returns:
        ``(features, target)``: a DataFrame with columns ``X1`` .. ``X20`` and
        the target evaluated by ``DataFrame.eval``, both cast to ``dtype``.
    """

    # A private generator seeded like ``np.random.seed(random_state)`` yields the
    # same stream of draws without reseeding the process-wide NumPy RNG.
    rng = np.random.RandomState(random_state)
    distribution = getattr(stats, distr_name)

    def draw() -> np.ndarray:
        # NOTE: loc/scale are passed positionally (kept for backward compatibility).
        # For scipy distributions that take shape parameters, positional arguments
        # fill the shape slots first, so this call is only straightforward for
        # shape-free distributions such as "norm".
        return distribution.rvs(loc, scale, *distr_params, size=N, random_state=rng)

    # Columns are drawn in order X1..X20; building the frame in one go avoids
    # repeated column insertion.
    df = pd.DataFrame({f"X{col_no}": draw() for col_no in range(1, 21)})

    # ``@I`` refers to the module-level indicator function; eval must be called
    # from this frame so that the name can be resolved.
    target = df.eval(
        "X1+sin(X2)+log(abs(X3))+X4**2+X5*X6 +@I((X7*X8*X9)<0)+@I(X10>0)+X11*@I(X11>0)+sqrt(abs(X12)) +cos(X13)+2*X14+abs(X15)+@I(X16<-1)+X17*@I(X17<-1)-2*X18-X19*X20"
    )

    if add_error:
        target += draw()
    if binarize:
        target = (target > target.mean()).astype(np.int8)
    return df.astype(dtype), target.astype(dtype)
|
||
|
||
def showcase_pycaret_datasets():
    """Return the 20 largest entries of the table produced by pycaret's ``get_data``.

    ``get_data`` is called without a dataset name (presumably this yields the
    catalogue of available datasets - confirm against the pycaret docs). The
    "# Instances" column is cast to int32, rows are sorted by it in ascending
    order, and the last 20 rows are returned with a fresh 0-based index.
    """

    # Imported lazily so that pycaret is only required when this helper is used.
    from pycaret.datasets import get_data

    size_column = "# Instances"

    catalogue = get_data(verbose=False)
    catalogue[size_column] = catalogue[size_column].astype(np.int32)

    largest = catalogue.sort_values(size_column).tail(20)
    return largest.reset_index(drop=True)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import numpy as np | ||
from numba import njit | ||
|
||
|
||
@njit()
def generate_probs_from_outcomes(
    outcomes: np.ndarray, chunk_size: int = 20, scale: float = 0.1, nbins: int = 10, bins_std: float = 0.1, flip_percent: float = 0.6
) -> np.ndarray:
    """Generate hypothetical 'ground truth' probabilities from known outcomes.

    Our model probs will (hopefully) be calibrated, so the synthetic probs need
    to be calibrated too, with some degree of fitness, and should cover a broad
    range of values. The procedure is:

    0) if flip_percent is specified, zeroes and ones are flipped at randomly
       drawn positions (drawn with replacement, so fewer than
       ``n * flip_percent`` distinct positions may be flipped). This lowers
       ROC AUC.
    1) the data is processed in consecutive chunks of ``chunk_size``
       observations; a shorter final chunk covers the remainder.
    2) for every chunk, its real event frequency is computed.
    3) a per-bin offset (pregenerated, spread controlled by ``bins_std``) is
       added to the frequency, then every observation of the chunk receives
       that value plus symmetric uniform noise scaled by ``scale``.
    The final result is clipped to [0, 1].

    Args:
        outcomes: 1-D array of observed outcomes (expected to hold 0/1 values).
        chunk_size: Number of observations per chunk.
        scale: Multiplier of the per-observation noise.
        nbins: Number of frequency bins that get their own constant offset.
        bins_std: Width of the uniform range the bin offsets are drawn from.
        flip_percent: Fraction of ``n`` used as the number of flip draws.

    Returns:
        float32 array of length ``len(outcomes)`` with values in [0, 1].
    """
    n = len(outcomes)
    indices = np.arange(n)
    np.random.shuffle(indices)

    probs = np.empty(n, dtype=np.float32)
    bin_offsets = (np.random.random(size=nbins) - 0.5) * bins_std

    if flip_percent:
        # flip some bits to worsen our so far perfect predictive power
        flip_size = int(n * flip_percent)
        if flip_size:
            outcomes = outcomes.copy()  # never modify the caller's array
            flip_indices = np.random.choice(indices, size=flip_size)
            outcomes[flip_indices] = 1 - outcomes[flip_indices]

    # NOTE(review): `indices` is shuffled above but only used to draw flip positions;
    # the chunks below are contiguous slices, not random subsets - confirm the intent.
    for left in range(0, n, chunk_size):
        # The last chunk may be shorter than chunk_size; processing it guarantees that
        # every element of `probs` (allocated uninitialised) is written.
        right = min(left + chunk_size, n)
        freq = outcomes[left:right].mean()  # real event frequency in the current chunk

        # add pregenerated offset for particular bin. A chunk consisting only of ones
        # has freq == 1.0, which would index one past the end, so clamp to the last bin.
        bin_idx = min(int(freq * nbins), nbins - 1)
        freq = freq + bin_offsets[bin_idx]

        # add small symmetric random noise. it must be higher when freq approaches [0;1] borders.
        probs[left:right] = freq + (np.random.random(size=right - left) - 0.5) * scale * np.abs(freq - 0.5)

    return np.clip(probs, 0.0, 1.0)
File renamed without changes.
This file was deleted.
Oops, something went wrong.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Oops, something went wrong.