Skip to content

Commit

Permalink
No commit message
Browse files Browse the repository at this point in the history
  • Loading branch information
fingoldo committed Oct 11, 2024
1 parent 79ba8e6 commit 5e21604
Show file tree
Hide file tree
Showing 70 changed files with 226 additions and 85 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
43 changes: 43 additions & 0 deletions mlframe/custom_estimators.py → custom_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,3 +449,46 @@ def clip_to_quantiles_winsor_quantile(arr):

def clip_to_quantiles_hard(arr):
    """Apply ``clip_to_quantiles`` to ``arr`` with ``quantile=0.01`` and ``method="hard"``."""
    clipped = clip_to_quantiles(arr, method="hard", quantile=0.01)
    return clipped



class IdentityEstimator(BaseEstimator):
    """Return some of the existing features as-is instead of real learning & predicting.

    Good for checking, via ML metrics, the decisions of other methods/models
    whose outputs are already present as columns of ``X``.

    Parameters
    ----------
    feature_names : list, optional
        Column labels to return. Only used when ``X`` is a pandas object.
    feature_indices : list, optional
        Positional column indices to return. Used for non-pandas inputs, and
        for pandas inputs when ``feature_names`` is empty/None.
    """

    def __init__(self, feature_names: list = None, feature_indices: list = None):
        self.feature_names = feature_names
        self.feature_indices = feature_indices

    def fit(self, X, y, **fit_params):
        """Learn nothing; only record ``classes_`` when used as a classifier.

        Returns
        -------
        self
        """
        if isinstance(self, ClassifierMixin):
            if isinstance(y, pd.Series):
                self.classes_ = sorted(y.unique())
            else:
                self.classes_ = sorted(np.unique(y))
        return self

    def _require_feature_indices(self) -> None:
        """Raise if positional selection is needed but ``feature_indices`` is unset."""
        # An explicit raise (not ``assert``) so the check survives ``python -O``.
        # Without it, ``X[:, None]`` on a numpy array would silently add an axis
        # instead of failing.
        if self.feature_indices is None:
            raise ValueError("feature_indices must be provided when features cannot be selected via feature_names.")

    def predict(self, X):
        """Return the configured columns of ``X`` unchanged.

        Raises
        ------
        ValueError
            If positional selection is required and ``feature_indices`` is None.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            if self.feature_names:
                return X.loc[:, self.feature_names].values
            self._require_feature_indices()
            return X.iloc[:, self.feature_indices].values

        self._require_feature_indices()
        return X[:, self.feature_indices]


class IdentityRegressor(IdentityEstimator, RegressorMixin):
    """Regressor flavour of ``IdentityEstimator``; adds no behaviour of its own."""


class IdentityClassifier(IdentityEstimator, ClassifierMixin):
    """Classifier flavour of ``IdentityEstimator``: selected features are treated as probabilities."""

    def predict_proba(self, X):
        """Return class probabilities built from the passed-through feature(s).

        For a binary problem where a single feature is selected, that feature
        is interpreted as the probability of the last class and the output has
        two columns: ``[1 - p, p]``. In every other case the selected features
        are returned unchanged.
        """
        last_class_probs = self.predict(X)
        if len(self.classes_) == 2:
            # A one-element list of features yields shape (n, 1); flatten it so
            # the binary case still produces the two-column layout.
            if last_class_probs.ndim == 2 and last_class_probs.shape[1] == 1:
                last_class_probs = last_class_probs[:, 0]
            if last_class_probs.ndim == 1:
                return np.vstack([1 - last_class_probs, last_class_probs]).T
        return last_class_probs
72 changes: 72 additions & 0 deletions datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------

import logging

logger = logging.getLogger(__name__)


# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------

from typing import *

import scipy
from scipy import stats
from scipy.stats import norm
import numpy as np, pandas as pd


# ----------------------------------------------------------------------------------------------------------------------------
# Core
# ----------------------------------------------------------------------------------------------------------------------------


def I(cond: np.ndarray) -> np.ndarray:
    """Indicator function: cast a condition to integers via ``astype(int)``."""
    indicator = cond.astype(int)
    return indicator


def get_sapp_dataset(
    loc: float = 0.0,
    scale: float = 9.0,
    distr_name: str = "norm",
    distr_params: tuple = (),
    N: int = 1000,
    add_error: bool = False,
    random_state: int = 42,
    dtype=np.float32,
    binarize: bool = True,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Generate the synthetic 20-feature dataset used in the work

    Subsemble: an ensemble method for combining subset-specific algorithm fits
    Stephanie Sapp, Mark J. van der Laan, and John Canny
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4000126/pdf/nihms-539092.pdf

    Parameters
    ----------
    loc, scale : float
        Location and scale passed to the distribution's ``rvs``.
    distr_name : str
        Name of a distribution object in ``scipy.stats`` (e.g. ``"norm"``, ``"t"``).
    distr_params : tuple
        Shape parameters of the distribution (empty for ``"norm"``).
    N : int
        Number of rows to generate.
    add_error : bool
        If True, add one more draw from the same distribution to the target.
    random_state : int
        Seed passed to ``np.random.seed`` (the global numpy RNG is reseeded).
    dtype
        dtype both returned objects are cast to.
    binarize : bool
        If True, the target becomes 1 where it exceeds its mean and 0 elsewhere.

    Returns
    -------
    (features, target)
        Features ``X1``..``X20`` as a DataFrame and the target as produced by
        ``DataFrame.eval`` (a pandas Series in practice), both cast to ``dtype``.
    """

    np.random.seed(random_state)

    distribution = getattr(stats, distr_name)

    def draw() -> np.ndarray:
        # scipy expects shape parameters first; loc/scale go by keyword so that
        # distributions with shape parameters are not given loc/scale as shapes.
        return distribution.rvs(*distr_params, loc=loc, scale=scale, size=N)

    # Columns are drawn sequentially in X1..X20 order.
    df = pd.DataFrame({f"X{i + 1}": draw() for i in range(20)})

    # ``@I`` resolves to the module-level indicator function.
    target = df.eval(
        "X1+sin(X2)+log(abs(X3))+X4**2+X5*X6 +@I((X7*X8*X9)<0)+@I(X10>0)+X11*@I(X11>0)+sqrt(abs(X12)) +cos(X13)+2*X14+abs(X15)+@I(X16<-1)+X17*@I(X17<-1)-2*X18-X19*X20"
    )

    if add_error:
        target += draw()
    if binarize:
        target = (target > target.mean()).astype(np.int8)
    return df.astype(dtype), target.astype(dtype)


def showcase_pycaret_datasets():
    """Return the 20 rows with the largest "# Instances" from pycaret's ``get_data`` listing.

    Rows are ordered by ascending "# Instances" and re-indexed from zero.
    NOTE(review): ``get_data(verbose=False)`` is presumably the catalogue of
    bundled datasets - confirm against the pycaret documentation.
    """

    from pycaret.datasets import get_data

    size_column = "# Instances"
    catalogue = get_data(verbose=False)
    catalogue[size_column] = catalogue[size_column].astype(np.int32)
    largest = catalogue.sort_values(size_column).tail(20)
    return largest.reset_index(drop=True)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -973,7 +973,7 @@ def get_numaggs_names(
)
+ ([] if (directional_only or not return_unsorted_stats) else "nuniques,modmin,modmax,modmean,modqty".split(","))
+ ([] if directional_only else (["q" + str(q) for q in q]))
+ ([] if not return_unsorted_stats else ["ncrs" + str(q) for q in q])
+ ([] if (directional_only or not return_unsorted_stats) else ["ncrs" + str(q) for q in q])
+ get_moments_slope_mi_feature_names(weights=weights, directional_only=directional_only, return_lintrend_approx_stats=return_lintrend_approx_stats)
# + ["mutual_info_regression",]
+ (["hursth", "hurstc"] if return_hurst else [])
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2465,7 +2465,7 @@ def discretize_array(
return quantize_search(arr, bins_edges).astype(dtype) # njitted


@njit(parallel=True)
#@njit(parallel=True)
def discretize_2d_array(
arr: np.ndarray,
n_bins: int = 10,
Expand All @@ -2479,7 +2479,7 @@ def discretize_2d_array(

res = np.empty_like(arr, dtype=dtype)

for col in prange(arr.shape[1]):
for col in tqdmu(arr.shape[1],desc='col',leave=False): # prange
res[:, col] = discretize_array(
arr=arr[:, col],
n_bins=n_bins,
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
30 changes: 0 additions & 30 deletions mlframe/lightninglib.py → lightninglib.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,36 +145,6 @@ def predict_proba(self, X):
return self.predict(X)


class IdentityEstimator(BaseEstimator):
"""Just returns the 1st feature as-is instead of real learning & predicting."""

def fit(self, X, y, **fit_params):
if isinstance(self, ClassifierMixin):
if isinstance(y, pd.Series):
self.classes_ = sorted(y.unique())
else:
self.classes_ = sorted(np.unique(y))
return self

def predict(self, X):
if isinstance(X, (pd.DataFrame, pd.Series)):
X = X.to_numpy()
return X[:, 0]


class IdentityRegressor(IdentityEstimator, RegressorMixin):
pass


class IdentityClassifier(IdentityEstimator, ClassifierMixin):
def predict_proba(self, X):
last_class_probs = self.predict(X)
if len(self.classes_) == 2:
return np.vstack([1 - last_class_probs, last_class_probs]).T
else:
return last_class_probs


# ----------------------------------------------------------------------------------------------------------------------------
# Data
# ----------------------------------------------------------------------------------------------------------------------------
Expand Down
34 changes: 18 additions & 16 deletions mlframe/metrics.py → metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,11 +554,7 @@ def integral_calibration_error_from_metrics(
ICE is a weighted sum of baseline losses-"roc_auc goodness over 0.5".
If roc_auc is not good enough, it incurs additional penalty.
"""
res = (
brier_loss * brier_loss_weight
+ (calibration_mae * mae_weight + calibration_std * std_weight) * np.abs(roc_auc - 0.5)
- np.abs(roc_auc - 0.5) * roc_auc_weight
)
res = brier_loss * brier_loss_weight + calibration_mae * mae_weight + calibration_std * std_weight - np.abs(roc_auc - 0.5) * roc_auc_weight
if np.abs(roc_auc - 0.5) < (min_roc_auc - 0.5):
res += roc_auc_penalty
return res
Expand Down Expand Up @@ -676,11 +672,13 @@ def create_robustness_subgroups(
return subgroups


def create_robustness_subgroups_indices(subgroups: dict, train_idx: np.ndarray, val_idx: np.ndarray, test_idx: np.ndarray, group_weights: dict = {}, cont_nbins: int = 3) -> dict:
def create_robustness_subgroups_indices(
subgroups: dict, train_idx: np.ndarray, val_idx: np.ndarray, test_idx: np.ndarray, group_weights: dict = {}, cont_nbins: int = 3
) -> dict:
res = {}
if len(val_idx)==len(test_idx):
if len(val_idx) == len(test_idx):
logger.warning(f"Validation and test sets have the same size. Robustness subgroups estimation will be incorrect.")
for arr in (train_idx, test_idx,val_idx):
for arr in (train_idx, test_idx, val_idx):
npoints = len(arr)
robustness_subgroups_indices = {}
for group_name, group_params in subgroups.items():
Expand Down Expand Up @@ -841,6 +839,7 @@ def robust_mlperf_metric(
higher_is_better: bool,
subgroups: dict = None,
whole_set_weight: float = 0.5,
min_group_size: int = 100,
) -> float:
"""Bins idices need to be aware of arr sizes: boostings can call the metric on
multiple sets of differnt lengths - train, val, etc. Arrays will be pure numpy, so no other means to
Expand All @@ -859,6 +858,8 @@ def robust_mlperf_metric(

perfs = []
for bin_name, bin_indices in bins.items():
if len(bin_indices) < min_group_size:
continue
if isinstance(y_score, Sequence):
if len(y_score) == 2:
metric_value = metric(y_true[bin_indices], [el[bin_indices] for el in y_score])
Expand All @@ -871,14 +872,15 @@ def robust_mlperf_metric(
metric_value = metric(y_true[bin_indices], y_score[bin_indices])
perfs.append(metric_value)

perfs = np.array(perfs)
bin_metric_value = perfs.mean()
if higher_is_better:
bin_metric_value -= perfs.std()
else:
bin_metric_value += perfs.std()
if perfs:
perfs = np.array(perfs)
bin_metric_value = perfs.mean()
if higher_is_better:
bin_metric_value -= perfs.std()
else:
bin_metric_value += perfs.std()

weights_sum += bin_weight
total_metric_value += bin_metric_value * bin_weight
weights_sum += bin_weight
total_metric_value += bin_metric_value * bin_weight

return total_metric_value / weights_sum
File renamed without changes.
File renamed without changes.
8 changes: 8 additions & 0 deletions mlframe/optimization.py → optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ def submit_evaluations(self, candidates: Sequence, evaluations: Sequence, durati
y_label=self.y_label,
expected_fitness_color=self.expected_fitness_color,
legend_location=self.legend_location,
skip_candidates=[0],
)


Expand Down Expand Up @@ -719,6 +720,7 @@ def plot_search_state(
ground_truth: np.ndarray,
known_candidates: np.ndarray,
known_evaluations: np.ndarray,
skip_candidates: Sequence,
acquisition_method: str,
mode: str,
additional_info: str,
Expand Down Expand Up @@ -752,8 +754,14 @@ def plot_search_state(
if y_pred is not None:
axMain.plot(search_space, y_pred, color="red", linestyle="dashed", label="Surrogate Function")
axMain.fill_between(search_space, y_pred - y_std, y_pred + y_std, color="blue", alpha=0.2)

axMain.scatter(known_candidates, known_evaluations, color="blue", label="Known Points")

if skip_candidates:
idx = ~np.isin(known_candidates, skip_candidates)
if idx.sum() > 0:
axMain.set_ylim([known_evaluations[idx].min(), None])

axExpectedFitness.set_yticklabels([])
axExpectedFitness.set_yticks([])
axExpectedFitness.set_ylabel(acquisition_method, color=expected_fitness_color)
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
51 changes: 51 additions & 0 deletions probabilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np
from numba import njit


@njit()
def generate_probs_from_outcomes(
    outcomes: np.ndarray, chunk_size: int = 20, scale: float = 0.1, nbins: int = 10, bins_std: float = 0.1, flip_percent: float = 0.6
) -> np.ndarray:
    """Generate hypothetical "ground truth" probabilities from outcomes known in advance.

    Our model probs will (hopefully) be calibrated, so the synthetic probs need to be
    calibrated too, with some degree of fitness, while covering a broad range of values.

    Procedure:
    0) if flip_percent is specified, zeroes and ones are flipped at randomly drawn
       positions; this lowers ROC AUC.
    1) the data is processed in small chunks of ``chunk_size`` observations
       (the last chunk may be shorter).
    2) for every chunk, its real event frequency is computed.
    3) a constant per-bin offset (drawn once, amplitude ``bins_std``) is added to the
       frequency, then every observation of the chunk gets the frequency plus
       symmetric uniform noise controlled by ``scale``.
    The final result is clipped to [0, 1].

    Parameters
    ----------
    outcomes : np.ndarray
        Observed outcomes; the flip step computes ``1 - outcomes``, so 0/1 values are expected.
    chunk_size : int
        Number of consecutive observations sharing one base frequency.
    scale : float
        Amplitude multiplier of the per-observation noise.
    nbins : int
        Number of frequency bins that receive their own constant offset.
    bins_std : float
        Width of the uniform range the bin offsets are drawn from.
    flip_percent : float
        ``int(n * flip_percent)`` positions are drawn for flipping.

    Returns
    -------
    np.ndarray
        float32 array of length ``len(outcomes)`` with values in [0, 1].
    """
    n = len(outcomes)
    indices = np.arange(n)
    np.random.shuffle(indices)

    probs = np.empty(n, dtype=np.float32)
    bin_offsets = (np.random.random(size=nbins) - 0.5) * bins_std

    if flip_percent:
        # flip some bits to worsen our so far perfect predictive power
        flip_size = int(n * flip_percent)
        if flip_size:
            outcomes = outcomes.copy()  # do not mutate the caller's array
            # NOTE(review): np.random.choice samples with replacement by default, so
            # fewer than flip_size distinct positions may end up flipped - confirm intent.
            flip_indices = np.random.choice(indices, size=flip_size)
            outcomes[flip_indices] = 1 - outcomes[flip_indices]

    # NOTE(review): chunks are contiguous slices of `outcomes`; the shuffled `indices`
    # are only used to choose flip positions - confirm whether random chunks were intended.
    # Ceil division so that a trailing partial chunk is filled too, rather than being
    # left as uninitialised memory from np.empty.
    n_chunks = (n + chunk_size - 1) // chunk_size
    for idx in range(n_chunks):
        left = idx * chunk_size
        right = min(left + chunk_size, n)
        freq = outcomes[left:right].mean()  # real event frequency in the current chunk

        # add pregenerated offset for particular bin; clamp because freq == 1.0 maps to
        # index nbins, which is out of bounds (numba does not bounds-check by default)
        bin_idx = min(max(int(freq * nbins), 0), nbins - 1)
        freq = freq + bin_offsets[bin_idx]

        # add small symmetric random noise; its amplitude grows as freq approaches the [0;1] borders
        probs[left:right] = freq + (np.random.random(size=right - left) - 0.5) * scale * np.abs(freq - 0.5)

    return np.clip(probs, 0.0, 1.0)
File renamed without changes.
15 changes: 0 additions & 15 deletions pyproject.toml

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 5e21604

Please sign in to comment.