From 5e21604159540d5ada2ea38713191b46a504259d Mon Sep 17 00:00:00 2001
From: fingoldo
Date: Fri, 11 Oct 2024 18:27:14 +0300
Subject: [PATCH]

---
 mlframe/Backtesting.py => Backtesting.py      | 0
 mlframe/Data.py => Data.py                    | 0
 ...ureEngineering.py => FeatureEngineering.py | 0
 mlframe/Features.py => Features.py            | 0
 mlframe/Models.py => Models.py                | 0
 mlframe/OldEnsembling.py => OldEnsembling.py  | 0
 mlframe/__init__.py => __init__.py            | 0
 mlframe/arrays.py => arrays.py                | 0
 mlframe/baselines.py => baselines.py          | 0
 mlframe/boruta_shap.py => boruta_shap.py      | 0
 mlframe/calibration.py => calibration.py      | 0
 mlframe/cluster.py => cluster.py              | 0
 mlframe/config.py => config.py                | 0
 mlframe/core.py => core.py                    | 0
 ...stom_estimators.py => custom_estimators.py | 43 +++++++++++
 datasets.py                                   | 72 +++++++++++++++++++
 .../early_stopping.py => early_stopping.py    | 0
 mlframe/eda.py => eda.py                      | 0
 mlframe/ensembling.py => ensembling.py        | 0
 mlframe/estimators.py => estimators.py        | 0
 mlframe/evaluation.py => evaluation.py        | 0
 mlframe/ewma.py => ewma.py                    | 0
 mlframe/experiments.py => experiments.py      | 0
 .../explainability.py => explainability.py    | 0
 ...feature_cleaning.py => feature_cleaning.py | 0
 .../__init__.py                               | 0
 .../basic.py                                  | 0
 .../categorical.py                            | 0
 .../hurst.py                                  | 0
 .../numerical.py                              | 2 +-
 .../timeseries.py                             | 0
 ...ure_importance.py => feature_importance.py | 0
 .../__init__.py                               | 0
 .../filters.py                                | 4 +-
 .../wrappers.py                               | 0
 mlframe/helpers.py => helpers.py              | 0
 mlframe/inference.py => inference.py          | 0
 mlframe/keras.py => keras.py                  | 0
 mlframe/lightninglib.py => lightninglib.py    | 30 --------
 mlframe/metrics.py => metrics.py              | 34 ++++-----
 mlframe/mlflowlib.py => mlflowlib.py          | 0
 .../model_selection.py => model_selection.py  | 0
 mlframe/optimization.py => optimization.py    | 8 +++
 mlframe/outliers.py => outliers.py            | 0
 mlframe/pipelines.py => pipelines.py          | 0
 mlframe/preprocessing.py => preprocessing.py  | 0
 probabilities.py                              | 51 +++++++++++
 ..._suffix_list.dat => public_suffix_list.dat | 0
 pyproject.toml                                | 15 ----
 mlframe/scalers.py => scalers.py              | 0
 mlframe/stats.py => stats.py                  | 0
 mlframe/synthetic.py => synthetic.py          | 0
 mlframe/tests.py => tests.py                  | 0
 mlframe/text.py => text.py                    | 0
 mlframe/training.py => training.py            | 52 ++++++++------
 mlframe/tuning.py => tuning.py                | 0
 .../unittest_arrays.py => unittest_arrays.py  | 0
 mlframe/utils.py => utils.py                  | 0
 mlframe/version.py => version.py              | 0
 {mlframe/votenrank => votenrank}/__init__.py  | 0
 .../data_processing.py                        | 0
 .../fairness_computation.py                   | 0
 {mlframe/votenrank => votenrank}/iia_exp.py   | 0
 .../leaderboard/Leaderboard.py                | 0
 .../leaderboard/__init__.py                   | 0
 .../leaderboard/_cw.py                        | 0
 .../leaderboard/_rules.py                     | 0
 .../leaderboard/settings.py                   | 0
 .../votenrank => votenrank}/stability_exp.py  | 0
 {mlframe/votenrank => votenrank}/utils.py     | 0
 70 files changed, 226 insertions(+), 85 deletions(-)
 rename mlframe/Backtesting.py => Backtesting.py (100%)
 rename mlframe/Data.py => Data.py (100%)
 rename mlframe/FeatureEngineering.py => FeatureEngineering.py (100%)
 rename mlframe/Features.py => Features.py (100%)
 rename mlframe/Models.py => Models.py (100%)
 rename mlframe/OldEnsembling.py => OldEnsembling.py (100%)
 rename mlframe/__init__.py => __init__.py (100%)
 rename mlframe/arrays.py => arrays.py (100%)
 rename mlframe/baselines.py => baselines.py (100%)
 rename mlframe/boruta_shap.py => boruta_shap.py (100%)
 rename mlframe/calibration.py => calibration.py (100%)
 rename mlframe/cluster.py => cluster.py (100%)
 rename mlframe/config.py => config.py (100%)
 rename mlframe/core.py => core.py (100%)
 rename mlframe/custom_estimators.py => custom_estimators.py (91%)
 create mode 100644 datasets.py
 rename mlframe/early_stopping.py => early_stopping.py (100%)
 rename mlframe/eda.py => eda.py (100%)
 rename mlframe/ensembling.py => ensembling.py (100%)
 rename mlframe/estimators.py => estimators.py (100%)
 rename mlframe/evaluation.py => evaluation.py (100%)
 rename mlframe/ewma.py => ewma.py (100%)
 rename mlframe/experiments.py => experiments.py (100%)
 rename mlframe/explainability.py => explainability.py (100%)
 rename mlframe/feature_cleaning.py => feature_cleaning.py (100%)
 rename {mlframe/feature_engineering => feature_engineering}/__init__.py (100%)
 rename {mlframe/feature_engineering => feature_engineering}/basic.py (100%)
 rename {mlframe/feature_engineering => feature_engineering}/categorical.py (100%)
 rename {mlframe/feature_engineering => feature_engineering}/hurst.py (100%)
 rename {mlframe/feature_engineering => feature_engineering}/numerical.py (99%)
 rename {mlframe/feature_engineering => feature_engineering}/timeseries.py (100%)
 rename mlframe/feature_importance.py => feature_importance.py (100%)
 rename {mlframe/feature_selection => feature_selection}/__init__.py (100%)
 rename {mlframe/feature_selection => feature_selection}/filters.py (99%)
 rename {mlframe/feature_selection => feature_selection}/wrappers.py (100%)
 rename mlframe/helpers.py => helpers.py (100%)
 rename mlframe/inference.py => inference.py (100%)
 rename mlframe/keras.py => keras.py (100%)
 rename mlframe/lightninglib.py => lightninglib.py (95%)
 rename mlframe/metrics.py => metrics.py (97%)
 rename mlframe/mlflowlib.py => mlflowlib.py (100%)
 rename mlframe/model_selection.py => model_selection.py (100%)
 rename mlframe/optimization.py => optimization.py (99%)
 rename mlframe/outliers.py => outliers.py (100%)
 rename mlframe/pipelines.py => pipelines.py (100%)
 rename mlframe/preprocessing.py => preprocessing.py (100%)
 create mode 100644 probabilities.py
 rename mlframe/public_suffix_list.dat => public_suffix_list.dat (100%)
 delete mode 100644 pyproject.toml
 rename mlframe/scalers.py => scalers.py (100%)
 rename mlframe/stats.py => stats.py (100%)
 rename mlframe/synthetic.py => synthetic.py (100%)
 rename mlframe/tests.py => tests.py (100%)
 rename mlframe/text.py => text.py (100%)
 rename mlframe/training.py => training.py (96%)
 rename mlframe/tuning.py => tuning.py (100%)
 rename mlframe/unittest_arrays.py => unittest_arrays.py (100%)
 rename mlframe/utils.py => utils.py (100%)
 rename mlframe/version.py => version.py (100%)
 rename {mlframe/votenrank => votenrank}/__init__.py (100%)
 rename {mlframe/votenrank => votenrank}/data_processing.py (100%)
 rename {mlframe/votenrank => votenrank}/fairness_computation.py (100%)
 rename {mlframe/votenrank => votenrank}/iia_exp.py (100%)
 rename {mlframe/votenrank => votenrank}/leaderboard/Leaderboard.py (100%)
 rename {mlframe/votenrank => votenrank}/leaderboard/__init__.py (100%)
 rename {mlframe/votenrank => votenrank}/leaderboard/_cw.py (100%)
 rename {mlframe/votenrank => votenrank}/leaderboard/_rules.py (100%)
 rename {mlframe/votenrank => votenrank}/leaderboard/settings.py (100%)
 rename {mlframe/votenrank => votenrank}/stability_exp.py (100%)
 rename {mlframe/votenrank => votenrank}/utils.py (100%)

diff --git a/mlframe/Backtesting.py b/Backtesting.py
similarity index 100%
rename from mlframe/Backtesting.py
rename to Backtesting.py
diff --git a/mlframe/Data.py b/Data.py
similarity index 100%
rename from mlframe/Data.py
rename to Data.py
diff --git a/mlframe/FeatureEngineering.py b/FeatureEngineering.py
similarity index 100%
rename from mlframe/FeatureEngineering.py
rename to FeatureEngineering.py
diff --git a/mlframe/Features.py b/Features.py
similarity index 100%
rename from mlframe/Features.py
rename to Features.py
diff --git a/mlframe/Models.py b/Models.py
similarity index 100%
rename from mlframe/Models.py
rename to Models.py
diff --git a/mlframe/OldEnsembling.py b/OldEnsembling.py
similarity index 100%
rename from mlframe/OldEnsembling.py
rename to OldEnsembling.py
diff --git a/mlframe/__init__.py b/__init__.py
similarity index 100%
rename from mlframe/__init__.py
rename to __init__.py
diff --git a/mlframe/arrays.py b/arrays.py
similarity index 100%
rename from mlframe/arrays.py
rename to arrays.py
diff --git a/mlframe/baselines.py b/baselines.py
similarity index 100%
rename from mlframe/baselines.py
rename to baselines.py
diff --git a/mlframe/boruta_shap.py b/boruta_shap.py
similarity index 100%
rename from mlframe/boruta_shap.py
rename to boruta_shap.py
diff --git a/mlframe/calibration.py b/calibration.py
similarity index 100%
rename from mlframe/calibration.py
rename to calibration.py
diff --git a/mlframe/cluster.py b/cluster.py
similarity index 100%
rename from mlframe/cluster.py
rename to cluster.py
diff --git a/mlframe/config.py b/config.py
similarity index 100%
rename from mlframe/config.py
rename to config.py
diff --git a/mlframe/core.py b/core.py
similarity index 100%
rename from mlframe/core.py
rename to core.py
diff --git a/mlframe/custom_estimators.py b/custom_estimators.py
similarity index 91%
rename from mlframe/custom_estimators.py
rename to custom_estimators.py
index dfeeaf5..b3577e3 100644
--- a/mlframe/custom_estimators.py
+++ b/custom_estimators.py
@@ -449,3 +449,46 @@ def clip_to_quantiles_winsor_quantile(arr):
 
 def clip_to_quantiles_hard(arr):
     return clip_to_quantiles(arr, quantile=0.01, method="hard")
+
+
+
+class IdentityEstimator(BaseEstimator):
+    """Just returns some of the existing features as-is instead of real learning & predicting.
+    Useful for evaluating, via ML metrics, the decisions of other methods/models.
+ """ + + def __init__(self,feature_names:list=None,feature_indices:list=None): + self.feature_names=feature_names + self.feature_indices=feature_indices + + def fit(self, X, y, **fit_params): + if isinstance(self, ClassifierMixin): + if isinstance(y, pd.Series): + self.classes_ = sorted(y.unique()) + else: + self.classes_ = sorted(np.unique(y)) + return self + + def predict(self, X): + if isinstance(X, (pd.DataFrame, pd.Series)): + if self.feature_names: + return X.loc[:, self.feature_names].values + else: + assert self.feature_indices is not None + return X.iloc[:, self.feature_indices].values + else: + assert self.feature_indices is not None + return X[:, self.feature_indices] + + +class IdentityRegressor(IdentityEstimator, RegressorMixin): + pass + + +class IdentityClassifier(IdentityEstimator, ClassifierMixin): + def predict_proba(self, X): + last_class_probs = self.predict(X) + if len(self.classes_) == 2 and last_class_probs.ndim==1: + return np.vstack([1 - last_class_probs, last_class_probs]).T + else: + return last_class_probs diff --git a/datasets.py b/datasets.py new file mode 100644 index 0000000..c10830f --- /dev/null +++ b/datasets.py @@ -0,0 +1,72 @@ +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# ---------------------------------------------------------------------------------------------------------------------------- + +import logging + +logger = logging.getLogger(__name__) + + +# ---------------------------------------------------------------------------------------------------------------------------- +# Normal Imports +# ---------------------------------------------------------------------------------------------------------------------------- + +from typing import * + +import scipy +from scipy import stats +from scipy.stats import norm +import numpy as np, pandas as pd + + +# ---------------------------------------------------------------------------------------------------------------------------- +# Core +# ---------------------------------------------------------------------------------------------------------------------------- + + +def I(cond: np.ndarray) -> np.ndarray: + # Indicator function + return cond.astype(int) + + +def get_sapp_dataset( + loc: float = 0.0, + scale: float = 9.0, + distr_name: str = "norm", + distr_params: tuple = (), + N: int = 1000, + add_error: bool = False, + random_state: int = 42, + dtype=np.float32, + binarize: bool = True, +) -> Tuple[pd.DataFrame, np.ndarray]: + """Used in work + Subsemble: an ensemble method for combining subset-specific algorithm fits + Stephanie Sapp, Mark J. 
diff --git a/datasets.py b/datasets.py
new file mode 100644
index 0000000..c10830f
--- /dev/null
+++ b/datasets.py
@@ -0,0 +1,72 @@
+# ----------------------------------------------------------------------------------------------------------------------------
+# LOGGING
+# ----------------------------------------------------------------------------------------------------------------------------
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Normal Imports
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from typing import *
+
+import scipy
+from scipy import stats
+from scipy.stats import norm
+import numpy as np, pandas as pd
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Core
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+def I(cond: np.ndarray) -> np.ndarray:
+    # Indicator function
+    return cond.astype(int)
+
+
+def get_sapp_dataset(
+    loc: float = 0.0,
+    scale: float = 9.0,
+    distr_name: str = "norm",
+    distr_params: tuple = (),
+    N: int = 1000,
+    add_error: bool = False,
+    random_state: int = 42,
+    dtype=np.float32,
+    binarize: bool = True,
+) -> Tuple[pd.DataFrame, np.ndarray]:
+    """Used in the paper
+    Subsemble: an ensemble method for combining subset-specific algorithm fits
+    by Stephanie Sapp, Mark J. van der Laan, and John Canny
+    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4000126/pdf/nihms-539092.pdf
+    """
+
+    np.random.seed(random_state)
+
+    df = pd.DataFrame()
+    for i in range(20):
+        df[f"X{i+1}"] = getattr(stats, distr_name).rvs(loc, scale, *distr_params, size=N)
+
+    target = df.eval(
+        "X1+sin(X2)+log(abs(X3))+X4**2+X5*X6 +@I((X7*X8*X9)<0)+@I(X10>0)+X11*@I(X11>0)+sqrt(abs(X12)) +cos(X13)+2*X14+abs(X15)+@I(X16<-1)+X17*@I(X17<-1)-2*X18-X19*X20"
+    )
+
+    if add_error:
+        target += getattr(stats, distr_name).rvs(loc, scale, *distr_params, size=N)
+    if binarize:
+        target = (target > target.mean()).astype(np.int8)
+    return df.astype(dtype), target.astype(dtype)
+
+
+def showcase_pycaret_datasets():
+
+    from pycaret.datasets import get_data
+
+    df = get_data(verbose=False)
+    df["# Instances"] = df["# Instances"].astype(np.int32)
+    return df.sort_values("# Instances").tail(20).reset_index(drop=True)
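
Note: a quick smoke test of the new synthetic benchmark (illustrative only; assumes datasets.py is on the path and scikit-learn is available). Binarizing at the target mean yields a roughly balanced binary problem:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from datasets import get_sapp_dataset

X, y = get_sapp_dataset(N=2000, add_error=True, binarize=True)
print(X.shape, float(np.mean(y)))  # 20 noisy features, class balance near 0.5
print(cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=3, scoring="roc_auc").mean())
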
return_unsorted_stats) else ["ncrs" + str(q) for q in q]) + get_moments_slope_mi_feature_names(weights=weights, directional_only=directional_only, return_lintrend_approx_stats=return_lintrend_approx_stats) # + ["mutual_info_regression",] + (["hursth", "hurstc"] if return_hurst else []) diff --git a/mlframe/feature_engineering/timeseries.py b/feature_engineering/timeseries.py similarity index 100% rename from mlframe/feature_engineering/timeseries.py rename to feature_engineering/timeseries.py diff --git a/mlframe/feature_importance.py b/feature_importance.py similarity index 100% rename from mlframe/feature_importance.py rename to feature_importance.py diff --git a/mlframe/feature_selection/__init__.py b/feature_selection/__init__.py similarity index 100% rename from mlframe/feature_selection/__init__.py rename to feature_selection/__init__.py diff --git a/mlframe/feature_selection/filters.py b/feature_selection/filters.py similarity index 99% rename from mlframe/feature_selection/filters.py rename to feature_selection/filters.py index 5216d39..ce90429 100644 --- a/mlframe/feature_selection/filters.py +++ b/feature_selection/filters.py @@ -2465,7 +2465,7 @@ def discretize_array( return quantize_search(arr, bins_edges).astype(dtype) # njitted -@njit(parallel=True) +#@njit(parallel=True) def discretize_2d_array( arr: np.ndarray, n_bins: int = 10, @@ -2479,7 +2479,7 @@ def discretize_2d_array( res = np.empty_like(arr, dtype=dtype) - for col in prange(arr.shape[1]): + for col in tqdmu(arr.shape[1],desc='col',leave=False): # prange res[:, col] = discretize_array( arr=arr[:, col], n_bins=n_bins, diff --git a/mlframe/feature_selection/wrappers.py b/feature_selection/wrappers.py similarity index 100% rename from mlframe/feature_selection/wrappers.py rename to feature_selection/wrappers.py diff --git a/mlframe/helpers.py b/helpers.py similarity index 100% rename from mlframe/helpers.py rename to helpers.py diff --git a/mlframe/inference.py b/inference.py similarity index 100% rename from mlframe/inference.py rename to inference.py diff --git a/mlframe/keras.py b/keras.py similarity index 100% rename from mlframe/keras.py rename to keras.py diff --git a/mlframe/lightninglib.py b/lightninglib.py similarity index 95% rename from mlframe/lightninglib.py rename to lightninglib.py index a287882..c5eeee5 100644 --- a/mlframe/lightninglib.py +++ b/lightninglib.py @@ -145,36 +145,6 @@ def predict_proba(self, X): return self.predict(X) -class IdentityEstimator(BaseEstimator): - """Just returns the 1st feature as-is instead of real learning & predicting.""" - - def fit(self, X, y, **fit_params): - if isinstance(self, ClassifierMixin): - if isinstance(y, pd.Series): - self.classes_ = sorted(y.unique()) - else: - self.classes_ = sorted(np.unique(y)) - return self - - def predict(self, X): - if isinstance(X, (pd.DataFrame, pd.Series)): - X = X.to_numpy() - return X[:, 0] - - -class IdentityRegressor(IdentityEstimator, RegressorMixin): - pass - - -class IdentityClassifier(IdentityEstimator, ClassifierMixin): - def predict_proba(self, X): - last_class_probs = self.predict(X) - if len(self.classes_) == 2: - return np.vstack([1 - last_class_probs, last_class_probs]).T - else: - return last_class_probs - - # ---------------------------------------------------------------------------------------------------------------------------- # Data # ---------------------------------------------------------------------------------------------------------------------------- diff --git a/mlframe/metrics.py b/metrics.py similarity 
diff --git a/mlframe/feature_selection/wrappers.py b/feature_selection/wrappers.py
similarity index 100%
rename from mlframe/feature_selection/wrappers.py
rename to feature_selection/wrappers.py
diff --git a/mlframe/helpers.py b/helpers.py
similarity index 100%
rename from mlframe/helpers.py
rename to helpers.py
diff --git a/mlframe/inference.py b/inference.py
similarity index 100%
rename from mlframe/inference.py
rename to inference.py
diff --git a/mlframe/keras.py b/keras.py
similarity index 100%
rename from mlframe/keras.py
rename to keras.py
diff --git a/mlframe/lightninglib.py b/lightninglib.py
similarity index 95%
rename from mlframe/lightninglib.py
rename to lightninglib.py
index a287882..c5eeee5 100644
--- a/mlframe/lightninglib.py
+++ b/lightninglib.py
@@ -145,36 +145,6 @@ def predict_proba(self, X):
         return self.predict(X)
 
 
-class IdentityEstimator(BaseEstimator):
-    """Just returns the 1st feature as-is instead of real learning & predicting."""
-
-    def fit(self, X, y, **fit_params):
-        if isinstance(self, ClassifierMixin):
-            if isinstance(y, pd.Series):
-                self.classes_ = sorted(y.unique())
-            else:
-                self.classes_ = sorted(np.unique(y))
-        return self
-
-    def predict(self, X):
-        if isinstance(X, (pd.DataFrame, pd.Series)):
-            X = X.to_numpy()
-        return X[:, 0]
-
-
-class IdentityRegressor(IdentityEstimator, RegressorMixin):
-    pass
-
-
-class IdentityClassifier(IdentityEstimator, ClassifierMixin):
-    def predict_proba(self, X):
-        last_class_probs = self.predict(X)
-        if len(self.classes_) == 2:
-            return np.vstack([1 - last_class_probs, last_class_probs]).T
-        else:
-            return last_class_probs
-
-
 # ----------------------------------------------------------------------------------------------------------------------------
 # Data
 # ----------------------------------------------------------------------------------------------------------------------------
diff --git a/mlframe/metrics.py b/metrics.py
similarity index 97%
rename from mlframe/metrics.py
rename to metrics.py
index eef7f68..5db1c6b 100644
--- a/mlframe/metrics.py
+++ b/metrics.py
@@ -554,11 +554,7 @@ def integral_calibration_error_from_metrics(
     ICE is a weighted sum of baseline losses-"roc_auc goodness over 0.5". If roc_auc is not good enough, it incurs additional penalty.
     """
-    res = (
-        brier_loss * brier_loss_weight
-        + (calibration_mae * mae_weight + calibration_std * std_weight) * np.abs(roc_auc - 0.5)
-        - np.abs(roc_auc - 0.5) * roc_auc_weight
-    )
+    res = brier_loss * brier_loss_weight + calibration_mae * mae_weight + calibration_std * std_weight - np.abs(roc_auc - 0.5) * roc_auc_weight
     if np.abs(roc_auc - 0.5) < (min_roc_auc - 0.5):
         res += roc_auc_penalty
     return res
@@ -676,11 +672,13 @@ def create_robustness_subgroups(
     return subgroups
 
 
-def create_robustness_subgroups_indices(subgroups: dict, train_idx: np.ndarray, val_idx: np.ndarray, test_idx: np.ndarray, group_weights: dict = {}, cont_nbins: int = 3) -> dict:
+def create_robustness_subgroups_indices(
+    subgroups: dict, train_idx: np.ndarray, val_idx: np.ndarray, test_idx: np.ndarray, group_weights: dict = {}, cont_nbins: int = 3
+) -> dict:
     res = {}
-    if len(val_idx)==len(test_idx):
+    if len(val_idx) == len(test_idx):
         logger.warning(f"Validation and test sets have the same size. Robustness subgroups estimation will be incorrect.")
-    for arr in (train_idx, test_idx,val_idx):
+    for arr in (train_idx, test_idx, val_idx):
         npoints = len(arr)
         robustness_subgroups_indices = {}
         for group_name, group_params in subgroups.items():
@@ -841,6 +839,7 @@ def robust_mlperf_metric(
     higher_is_better: bool,
     subgroups: dict = None,
     whole_set_weight: float = 0.5,
+    min_group_size: int = 100,
 ) -> float:
     """Bins indices need to be aware of arr sizes: boostings can call the metric on multiple sets of different lengths - train, val, etc.
     Arrays will be pure numpy, so no other means to
@@ -859,6 +858,8 @@ def robust_mlperf_metric(
 
     perfs = []
     for bin_name, bin_indices in bins.items():
+        if len(bin_indices) < min_group_size:
+            continue
         if isinstance(y_score, Sequence):
             if len(y_score) == 2:
                 metric_value = metric(y_true[bin_indices], [el[bin_indices] for el in y_score])
@@ -871,14 +872,15 @@ def robust_mlperf_metric(
             metric_value = metric(y_true[bin_indices], y_score[bin_indices])
         perfs.append(metric_value)
 
-    perfs = np.array(perfs)
-    bin_metric_value = perfs.mean()
-    if higher_is_better:
-        bin_metric_value -= perfs.std()
-    else:
-        bin_metric_value += perfs.std()
+    if perfs:
+        perfs = np.array(perfs)
+        bin_metric_value = perfs.mean()
+        if higher_is_better:
+            bin_metric_value -= perfs.std()
+        else:
+            bin_metric_value += perfs.std()
 
-    weights_sum += bin_weight
-    total_metric_value += bin_metric_value * bin_weight
+        weights_sum += bin_weight
+        total_metric_value += bin_metric_value * bin_weight
 
     return total_metric_value / weights_sum
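
Note: with the two metrics.py hunks above, robust_mlperf_metric now skips subgroup bins smaller than min_group_size and only folds the mean-minus-std aggregate into the total when at least one bin survived. A simplified standalone sketch of that aggregation idea (single grouping variable, no bin weights; illustrative, not mlframe's exact implementation):

import numpy as np
from sklearn.metrics import roc_auc_score

def robust_metric(y_true, y_score, groups, min_group_size: int = 100) -> float:
    perfs = []
    for g in np.unique(groups):
        idx = groups == g
        if idx.sum() < min_group_size:
            continue  # tiny bins produce noisy, misleading estimates
        perfs.append(roc_auc_score(y_true[idx], y_score[idx]))
    perfs = np.array(perfs)
    return perfs.mean() - perfs.std()  # penalize instability across subgroups

y = np.random.randint(0, 2, 3000)
p = np.clip(y * 0.4 + np.random.rand(3000) * 0.6, 0.0, 1.0)
print(robust_metric(y, p, groups=np.random.randint(0, 5, 3000)))
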
diff --git a/mlframe/mlflowlib.py b/mlflowlib.py
similarity index 100%
rename from mlframe/mlflowlib.py
rename to mlflowlib.py
diff --git a/mlframe/model_selection.py b/model_selection.py
similarity index 100%
rename from mlframe/model_selection.py
rename to model_selection.py
diff --git a/mlframe/optimization.py b/optimization.py
similarity index 99%
rename from mlframe/optimization.py
rename to optimization.py
index 606a9b4..75066e0 100644
--- a/mlframe/optimization.py
+++ b/optimization.py
@@ -521,6 +521,7 @@ def submit_evaluations(self, candidates: Sequence, evaluations: Sequence, durati
             y_label=self.y_label,
             expected_fitness_color=self.expected_fitness_color,
             legend_location=self.legend_location,
+            skip_candidates=[0],
         )
 
 
@@ -719,6 +720,7 @@ def plot_search_state(
     ground_truth: np.ndarray,
     known_candidates: np.ndarray,
    known_evaluations: np.ndarray,
+    skip_candidates: Sequence,
     acquisition_method: str,
     mode: str,
     additional_info: str,
@@ -752,8 +754,14 @@ def plot_search_state(
     if y_pred is not None:
         axMain.plot(search_space, y_pred, color="red", linestyle="dashed", label="Surrogate Function")
         axMain.fill_between(search_space, y_pred - y_std, y_pred + y_std, color="blue", alpha=0.2)
+
     axMain.scatter(known_candidates, known_evaluations, color="blue", label="Known Points")
 
+    if skip_candidates:
+        idx = ~np.isin(known_candidates, skip_candidates)
+        if idx.sum() > 0:
+            axMain.set_ylim([known_evaluations[idx].min(), None])
+
     axExpectedFitness.set_yticklabels([])
     axExpectedFitness.set_yticks([])
     axExpectedFitness.set_ylabel(acquisition_method, color=expected_fitness_color)
diff --git a/mlframe/outliers.py b/outliers.py
similarity index 100%
rename from mlframe/outliers.py
rename to outliers.py
diff --git a/mlframe/pipelines.py b/pipelines.py
similarity index 100%
rename from mlframe/pipelines.py
rename to pipelines.py
diff --git a/mlframe/preprocessing.py b/preprocessing.py
similarity index 100%
rename from mlframe/preprocessing.py
rename to preprocessing.py
diff --git a/probabilities.py b/probabilities.py
new file mode 100644
index 0000000..2b10f76
--- /dev/null
+++ b/probabilities.py
@@ -0,0 +1,51 @@
+import numpy as np
+from numba import njit
+
+
+@njit()
+def generate_probs_from_outcomes(
+    outcomes: np.ndarray, chunk_size: int = 20, scale: float = 0.1, nbins: int = 10, bins_std: float = 0.1, flip_percent: float = 0.6
+) -> np.ndarray:
+    """Can we generate hypothetical ground truth probs knowing the outcomes in advance?
+    Our model probs will (hopefully) be calibrated. So, we need synthetic probs to be calibrated, too, with some degree of fitness.
+    We also need to cover a broad range of probs.
+    So, how to achieve this?
+
+    0) if flip_percent is specified, for a random portion of the data zeroes and ones are flipped. this will lower ROC AUC.
+    1) we can work with small random chunks/subsets of the data
+    2) for every chunk, its real freq is computed.
+    3) for every observation, an 'exact' prob is drawn from some distribution (uniform or, say, gaussian) centered at the real freq.
+    then, if bins_std is specified, constant bin noise is applied to all observations of the chunk.
+
+    the final result is clipped to [0, 1]
+    """
+    n = len(outcomes)
+    indices = np.arange(n)
+    np.random.shuffle(indices)
+
+    probs = np.empty(n, dtype=np.float32)
+    bin_offsets = (np.random.random(size=nbins) - 0.5) * bins_std
+
+    if flip_percent:
+        # flip some bits to worsen our so far perfect predictive power
+        flip_size = int(n * flip_percent)
+        if flip_size:
+            outcomes = outcomes.copy()
+            flip_indices = np.random.choice(indices, size=flip_size)
+            outcomes[flip_indices] = 1 - outcomes[flip_indices]
+
+    l = 0  # left border
+    for idx in range(n // chunk_size):  # traverse randomly selected chunks/subsets of original data
+        r = (idx + 1) * chunk_size  # right border
+        freq = outcomes[l:r].mean()  # find the real event frequency occurring in the current chunk of observations
+
+        # add a pregenerated offset for the particular bin (clamped so that freq == 1.0 maps into the last bin)
+        bin_idx = min(int(freq * nbins), nbins - 1)
+        freq = freq + bin_offsets[bin_idx]
+
+        # add small symmetric random noise. it must be higher when freq approaches the [0;1] borders.
+        probs[l:r] = freq + (np.random.random(size=chunk_size) - 0.5) * scale * np.abs(freq - 0.5)
+
+        l = r
+
+    return np.clip(probs, 0.0, 1.0)
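
Note: a quick sanity check of the new generator (illustrative only; assumes probabilities.py is importable and numba is installed). Feeding known outcomes should yield probabilities spread over a broad range, with only moderate ROC AUC because of the flips; the trailing n % chunk_size observations are left unfilled, so pick n divisible by chunk_size:

import numpy as np
from sklearn.metrics import roc_auc_score
from probabilities import generate_probs_from_outcomes

outcomes = (np.random.rand(10_000) < 0.3).astype(np.int8)  # n divisible by chunk_size
probs = generate_probs_from_outcomes(outcomes)

print(roc_auc_score(outcomes, probs))  # noticeably below 1.0 due to flip_percent
for b in range(10):  # crude reliability table: mean outcome per probability decile
    mask = (probs >= b / 10) & (probs < (b + 1) / 10)
    if mask.sum():
        print(f"[{b / 10:.1f}, {(b + 1) / 10:.1f}) -> {outcomes[mask].mean():.3f} (n={mask.sum()})")
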
diff --git a/mlframe/public_suffix_list.dat b/public_suffix_list.dat
similarity index 100%
rename from mlframe/public_suffix_list.dat
rename to public_suffix_list.dat
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 8e9c236..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-[tool.poetry]
-name = "mlframe"
-version = "0.1.0"
-description = "tools for machine learning in python"
-authors = ["Your Name "]
-license = "MIT"
-
-[tool.poetry.dependencies]
-python = "^3.11"
-
-[tool.poetry.dev-dependencies]
-
-[build-system]
-requires = ["poetry>=0.12"]
-build-backend = "poetry.masonry.api"
diff --git a/mlframe/scalers.py b/scalers.py
similarity index 100%
rename from mlframe/scalers.py
rename to scalers.py
diff --git a/mlframe/stats.py b/stats.py
similarity index 100%
rename from mlframe/stats.py
rename to stats.py
diff --git a/mlframe/synthetic.py b/synthetic.py
similarity index 100%
rename from mlframe/synthetic.py
rename to synthetic.py
diff --git a/mlframe/tests.py b/tests.py
similarity index 100%
rename from mlframe/tests.py
rename to tests.py
diff --git a/mlframe/text.py b/text.py
similarity index 100%
rename from mlframe/text.py
rename to text.py
diff --git a/mlframe/training.py b/training.py
similarity index 96%
rename from mlframe/training.py
rename to training.py
index fb9ec4d..f7178da 100644
--- a/mlframe/training.py
+++ b/training.py
@@ -24,6 +24,7 @@
 import copy
 import joblib
 import psutil
+import inspect
 from gc import collect
 from functools import partial
 from os.path import join, exists
@@ -85,6 +86,15 @@ def root_mean_squared_error(
 from mlframe.metrics import fast_roc_auc, fast_calibration_report, compute_probabilistic_multiclass_error, CB_EVAL_METRIC
 from mlframe.metrics import create_robustness_subgroups,create_robustness_subgroups_indices,compute_robustness_metrics,robust_mlperf_metric
 
+# ----------------------------------------------------------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+def get_function_param_names(func):
+    signature = inspect.signature(func)
+    return list(signature.parameters.keys())
+
 # ----------------------------------------------------------------------------------------------------------------------------
 # Inits
 # ----------------------------------------------------------------------------------------------------------------------------
@@ -381,6 +391,7 @@ def train_and_evaluate_model(
     """
     collect()
+    best_iter=None
 
     if not custom_ice_metric:
         custom_ice_metric = compute_probabilistic_multiclass_error
@@ -428,12 +439,10 @@
         if val_idx is not None:
             # insert eval_set where needed
-            if model_type_name in XGBOOST_MODEL_TYPES:
-                fit_params["eval_set"] = ((val_df, target.loc[val_idx]),)
-            elif model_type_name in LGBM_MODEL_TYPES:
+            if model_type_name in LGBM_MODEL_TYPES:
                 fit_params["eval_set"] = (val_df, target.loc[val_idx])
                 # fit_params["callbacks"] = [lgb.early_stopping(stopping_rounds=early_stopping_rounds)]
-            elif model_type_name in CATBOOST_MODEL_TYPES:
+            elif model_type_name in CATBOOST_MODEL_TYPES or model_type_name in XGBOOST_MODEL_TYPES:
                 fit_params["eval_set"] = [
                     (val_df, target.loc[val_idx]),
                 ]
@@ -448,13 +457,11 @@
         if "cat_features" in fit_params:
             fit_params["cat_features"] = [col for col in fit_params["cat_features"] if col in train_df.columns]
 
-    if fit_params and isinstance(model, Pipeline):
-        fit_params = prefix_dict_elems(fit_params, "est__")
-
     if model is not None:
         if (not use_cache) or (not exists(model_file_name)):
             if sample_weight is not None:
-                sample_weight = sample_weight.loc[train_idx].values
+                if "sample_weight" in get_function_param_names(model_obj.fit):
+                    fit_params["sample_weight"] = sample_weight.loc[train_idx].values
             if verbose:
                 logger.info(f"{model_name} training dataset shape: {train_df.shape}")
             if display_sample_size:
                 display(train_df.head(display_sample_size).style.set_caption(f"{model_name} features head"))
@@ -470,7 +477,10 @@
             if model_type_name in TABNET_MODEL_TYPES:
                 train_df=train_df.values
 
-            model.fit(train_df, target.loc[train_idx], sample_weight=sample_weight, **fit_params)
+            if fit_params and isinstance(model, Pipeline):
+                fit_params = prefix_dict_elems(fit_params, "est__")
+
+            model.fit(train_df, target.loc[train_idx], **fit_params)
 
     if model is not None:
         # get number of the best iteration
         try:
@@ -478,9 +488,9 @@
             if best_iter:
                 print(f"es_best_iter: {best_iter:_}")
         except Exception as e:
-            logger.warning(e)
+            logger.warning(e)
 
-    metrics={'train':{},'val':{},'test':{}}
+    metrics={'train':{},'val':{},'test':{},'best_iter':best_iter}
     if compute_trainset_metrics or compute_valset_metrics or compute_testset_metrics:
         if compute_trainset_metrics and train_idx is not None:
             if df is None:
@@ -511,7 +521,7 @@
                 metrics=metrics['train']
             )
 
-        if compute_valset_metrics and val_idx is not None:
+        if compute_valset_metrics and val_idx is not None and len(val_idx)>0:
            if df is None:
                val_df = None
                columns = []
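
Note: the hunks above route sample_weight through fit_params only when the estimator's fit() signature actually declares it, using the new get_function_param_names helper. A minimal standalone sketch of that introspection pattern (illustrative estimators):

import inspect
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

def get_function_param_names(func) -> list:
    return list(inspect.signature(func).parameters.keys())

def fit_with_optional_weights(model, X, y, sample_weight=None):
    fit_params = {}
    if sample_weight is not None and "sample_weight" in get_function_param_names(model.fit):
        fit_params["sample_weight"] = sample_weight  # pass only when supported
    return model.fit(X, y, **fit_params)

X, y, w = np.random.randn(200, 3), np.random.randint(0, 2, 200), np.random.rand(200)
fit_with_optional_weights(LogisticRegression(), X, y, w)    # weights are passed
fit_with_optional_weights(KNeighborsClassifier(), X, y, w)  # weights are skipped
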
@@ -540,7 +550,7 @@
                 metrics=metrics['val']
             )
 
-        if compute_testset_metrics and test_idx is not None:
+        if compute_testset_metrics and test_idx is not None and len(test_idx)>0:
             if df is not None:
                 del train_df
@@ -733,8 +743,8 @@ def report_regression_model_perf(
     if subgroups:
         robustness_report = compute_robustness_metrics(
             subgroups=subgroups, subset_index=subset_index, y_true=targets, y_pred=preds,
-            metrics={'MAE':mean_absolute_error,'MAPE':mean_absolute_percentage_error},
-            metrics_higher_is_better={'MAE':False,'MAPE':False},
+            metrics={'MAE':mean_absolute_error,'RMSE':root_mean_squared_error},
+            metrics_higher_is_better={'MAE':False,'RMSE':False},
         )
         if robustness_report is not None:
             if print_report:
@@ -863,11 +873,11 @@ def report_probabilistic_model_perf(
         print(classification_report(targets, preds, zero_division=0, digits=report_ndigits))
         print(f"ROC AUCs: {', '.join(roc_aucs)}")
         print(f"PR AUCs: {', '.join(pr_aucs)}")
-        print(f"CALIBRATIONS: \n{', '.join(calibs)}")
-        print(f"BRIER LOSS: \n\t{', '.join(brs)}")
-        print(f"ICE: \n\t{', '.join(integral_errors)}")
+        print(f"CALIBRATIONs: \n{', '.join(calibs)}")
+        print(f"BRIER LOSSes: \n\t{', '.join(brs)}")
+        print(f"ICEs: \n\t{', '.join(integral_errors)}")
         if custom_ice_metric!=custom_rice_metric:
-            print(f"RICE: \n\t{', '.join(robust_integral_errors)}")
+            print(f"RICEs: \n\t{', '.join(robust_integral_errors)}")
         print(f"TOTAL INTEGRAL ERROR: {integral_error:.4f}")
         if custom_rice_metric and custom_rice_metric!=custom_ice_metric:
@@ -878,7 +888,7 @@ def report_probabilistic_model_perf(
         subgroups_metrics={'ICE':custom_ice_metric}
         metrics_higher_is_better={'ICE':False}
 
-        if probs.shape[1]>2:
+        if probs.shape[1]==2:
             subgroups_metrics['ROC AUC']=fast_roc_auc
             metrics_higher_is_better['ROC AUC']=True
 
@@ -957,7 +967,7 @@ def configure_training_params(df:pd.DataFrame,target:pd.Series,train_idx:np.ndar
     common_params=dict(nbins=nbins,subgroups=subgroups,sample_weight=sample_weight,df=df,target=target,train_idx=train_idx,test_idx=test_idx,val_idx=val_idx,target_label_encoder=target_label_encoder,custom_ice_metric=configs.integral_calibration_error,custom_rice_metric=configs.final_integral_calibration_error)
 
-    common_cb_params=dict(model=TransformedTargetRegressor(CatBoostRegressor(**configs.CB_REGR),transformer=PowerTransformer()) if use_regression else CatBoostClassifier(**configs.CB_CALIB_CLASSIF),fit_params=dict(plot=verbose,cat_features=cat_features))
+    common_cb_params=dict(model=CatBoostRegressor(**configs.CB_REGR) if use_regression else CatBoostClassifier(**configs.CB_CALIB_CLASSIF),fit_params=dict(plot=verbose,cat_features=cat_features))  # TransformedTargetRegressor(CatBoostRegressor(**configs.CB_REGR),transformer=PowerTransformer())
 
     common_xgb_params=dict(model=XGBRegressor(**configs.XGB_GENERAL_PARAMS) if use_regression else XGBClassifier(**configs.XGB_CALIB_CLASSIF),fit_params=dict(verbose=False))
diff --git a/mlframe/tuning.py b/tuning.py
similarity index 100%
rename from mlframe/tuning.py
rename to tuning.py
diff --git a/mlframe/unittest_arrays.py b/unittest_arrays.py
similarity index 100%
rename from mlframe/unittest_arrays.py
rename to unittest_arrays.py
diff --git a/mlframe/utils.py b/utils.py
similarity index 100%
rename from mlframe/utils.py
rename to utils.py
diff --git a/mlframe/version.py b/version.py
similarity index 100%
rename from mlframe/version.py
rename to version.py
diff --git a/mlframe/votenrank/__init__.py b/votenrank/__init__.py
similarity index 100%
rename from mlframe/votenrank/__init__.py
rename to votenrank/__init__.py
diff --git a/mlframe/votenrank/data_processing.py b/votenrank/data_processing.py
similarity index 100%
rename from mlframe/votenrank/data_processing.py
rename to votenrank/data_processing.py
diff --git a/mlframe/votenrank/fairness_computation.py b/votenrank/fairness_computation.py
similarity index 100%
rename from mlframe/votenrank/fairness_computation.py
rename to votenrank/fairness_computation.py
diff --git a/mlframe/votenrank/iia_exp.py b/votenrank/iia_exp.py
similarity index 100%
rename from mlframe/votenrank/iia_exp.py
rename to votenrank/iia_exp.py
diff --git a/mlframe/votenrank/leaderboard/Leaderboard.py b/votenrank/leaderboard/Leaderboard.py
similarity index 100%
rename from mlframe/votenrank/leaderboard/Leaderboard.py
rename to votenrank/leaderboard/Leaderboard.py
diff --git a/mlframe/votenrank/leaderboard/__init__.py b/votenrank/leaderboard/__init__.py
similarity index 100%
rename from mlframe/votenrank/leaderboard/__init__.py
rename to votenrank/leaderboard/__init__.py
diff --git a/mlframe/votenrank/leaderboard/_cw.py b/votenrank/leaderboard/_cw.py
similarity index 100%
rename from mlframe/votenrank/leaderboard/_cw.py
rename to votenrank/leaderboard/_cw.py
diff --git a/mlframe/votenrank/leaderboard/_rules.py b/votenrank/leaderboard/_rules.py
similarity index 100%
rename from mlframe/votenrank/leaderboard/_rules.py
rename to votenrank/leaderboard/_rules.py
diff --git a/mlframe/votenrank/leaderboard/settings.py b/votenrank/leaderboard/settings.py
similarity index 100%
rename from mlframe/votenrank/leaderboard/settings.py
rename to votenrank/leaderboard/settings.py
diff --git a/mlframe/votenrank/stability_exp.py b/votenrank/stability_exp.py
similarity index 100%
rename from mlframe/votenrank/stability_exp.py
rename to votenrank/stability_exp.py
diff --git a/mlframe/votenrank/utils.py b/votenrank/utils.py
similarity index 100%
rename from mlframe/votenrank/utils.py
rename to votenrank/utils.py