No commit message

fingoldo · Oct 11, 2024 · 5e21604 · 5e21604
1 parent 79ba8e6
commit 5e21604
Show file tree

Hide file tree

Showing 70 changed files with 226 additions and 85 deletions.
diff --git a/mlframe/Backtesting.py → Backtesting.py b/mlframe/Backtesting.py → Backtesting.py
diff --git a/mlframe/Data.py → Data.py b/mlframe/Data.py → Data.py
diff --git a/mlframe/FeatureEngineering.py → FeatureEngineering.py b/mlframe/FeatureEngineering.py → FeatureEngineering.py
diff --git a/mlframe/Features.py → Features.py b/mlframe/Features.py → Features.py
diff --git a/mlframe/Models.py → Models.py b/mlframe/Models.py → Models.py
diff --git a/mlframe/OldEnsembling.py → OldEnsembling.py b/mlframe/OldEnsembling.py → OldEnsembling.py
diff --git a/mlframe/__init__.py → __init__.py b/mlframe/__init__.py → __init__.py
diff --git a/mlframe/arrays.py → arrays.py b/mlframe/arrays.py → arrays.py
diff --git a/mlframe/baselines.py → baselines.py b/mlframe/baselines.py → baselines.py
diff --git a/mlframe/boruta_shap.py → boruta_shap.py b/mlframe/boruta_shap.py → boruta_shap.py
diff --git a/mlframe/calibration.py → calibration.py b/mlframe/calibration.py → calibration.py
diff --git a/mlframe/cluster.py → cluster.py b/mlframe/cluster.py → cluster.py
diff --git a/mlframe/config.py → config.py b/mlframe/config.py → config.py
diff --git a/mlframe/core.py → core.py b/mlframe/core.py → core.py
diff --git a/mlframe/custom_estimators.py → custom_estimators.py b/mlframe/custom_estimators.py → custom_estimators.py
@@ -449,3 +449,46 @@ def clip_to_quantiles_winsor_quantile(arr):
 
 def clip_to_quantiles_hard(arr):
     return clip_to_quantiles(arr, quantile=0.01, method="hard")
+
+
+
+class IdentityEstimator(BaseEstimator):
+    """Pass selected input features through unchanged instead of real learning & predicting.
+
+    Good for checking, via the usual ML metrics, the decisions of other methods/models
+    whose outputs are already present among the features.
+
+    Parameters
+    ----------
+    feature_names : list, optional
+        Column labels to return when ``X`` is a pandas object. When non-empty, it is used
+        in preference to ``feature_indices`` for pandas input.
+    feature_indices : list, optional
+        Positional column indices to return. Required for non-pandas input, and for
+        pandas input when ``feature_names`` is empty or None.
+    """
+
+    def __init__(self, feature_names: list = None, feature_indices: list = None):
+        self.feature_names = feature_names
+        self.feature_indices = feature_indices
+
+    def fit(self, X, y, **fit_params):
+        """Learn nothing; for classifier subclasses only, store the sorted unique labels of ``y`` in ``classes_``.
+
+        ``X`` and ``fit_params`` are accepted but not used. Returns ``self``.
+        """
+        if isinstance(self, ClassifierMixin):
+            if isinstance(y, pd.Series):
+                self.classes_ = sorted(y.unique())
+            else:
+                self.classes_ = sorted(np.unique(y))
+        return self
+
+    def _require_feature_indices(self) -> None:
+        # Explicit raise instead of `assert`: asserts vanish under `python -O`, after which
+        # `X[:, None]` on an array would silently return the whole input with an extra axis.
+        if self.feature_indices is None:
+            raise ValueError("IdentityEstimator needs feature_indices (or, for pandas input, feature_names) to be set.")
+
+    def predict(self, X):
+        """Return the configured columns of ``X`` as-is.
+
+        pandas input is selected by ``feature_names`` (label-based) when those are set,
+        otherwise by ``feature_indices`` (position-based), and the underlying ``.values``
+        are returned. Any other input is indexed as ``X[:, feature_indices]``.
+
+        Raises
+        ------
+        ValueError
+            If ``feature_indices`` is needed for the given input but is None.
+        """
+        if isinstance(X, (pd.DataFrame, pd.Series)):
+            if self.feature_names:
+                return X.loc[:, self.feature_names].values
+            self._require_feature_indices()
+            return X.iloc[:, self.feature_indices].values
+
+        self._require_feature_indices()
+        return X[:, self.feature_indices]
+
+
+class IdentityRegressor(IdentityEstimator, RegressorMixin):
+    """IdentityEstimator combined with RegressorMixin; adds no behaviour of its own."""
+
+
+class IdentityClassifier(IdentityEstimator, ClassifierMixin):
+    """IdentityEstimator combined with ClassifierMixin, exposing the passed-through values as probabilities."""
+
+    def predict_proba(self, X):
+        """Return the output of ``predict`` shaped as class probabilities.
+
+        When exactly two classes are known and ``predict`` yields a 1-D array, that array
+        is treated as the probability of the last class and a complementary first column
+        (``1 - p``) is added. In every other case the ``predict`` output is returned untouched.
+        """
+        passed_through = self.predict(X)
+        needs_complement = len(self.classes_) == 2 and passed_through.ndim == 1
+        if not needs_complement:
+            return passed_through
+        return np.vstack([1 - passed_through, passed_through]).T
diff --git a/datasets.py b/datasets.py
@@ -0,0 +1,72 @@
+# ----------------------------------------------------------------------------------------------------------------------------
+# LOGGING
+# ----------------------------------------------------------------------------------------------------------------------------
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Normal Imports
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from typing import *
+
+import scipy
+from scipy import stats
+from scipy.stats import norm
+import numpy as np, pandas as pd
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Core
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+def I(cond: np.ndarray) -> np.ndarray:
+    """Indicator function: cast a condition to integers via ``astype(int)`` (True -> 1, False -> 0)."""
+    as_integers = cond.astype(int)
+    return as_integers
+
+
+def get_sapp_dataset(
+    loc: float = 0.0,
+    scale: float = 9.0,
+    distr_name: str = "norm",
+    distr_params: tuple = (),
+    N: int = 1000,
+    add_error: bool = False,
+    random_state: int = 42,
+    dtype=np.float32,
+    binarize: bool = True,
+) -> Tuple[pd.DataFrame, pd.Series]:
+    """Generate the synthetic 20-feature dataset used in the work
+    Subsemble: an ensemble method for combining subset-specific algorithm fits
+    Stephanie Sapp, Mark J. van der Laan, and John Canny
+    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4000126/pdf/nihms-539092.pdf
+
+    Features X1..X20 are drawn independently from the ``scipy.stats`` distribution named
+    ``distr_name``; the target is a fixed non-linear expression of them.
+
+    Parameters
+    ----------
+    loc, scale : location and scale passed to the distribution.
+    distr_name : name of an attribute of ``scipy.stats`` exposing ``rvs``.
+    distr_params : shape parameters of the distribution (empty for e.g. "norm").
+    N : number of rows.
+    add_error : if True, add one more draw from the same distribution to the target.
+    random_state : seed of the private random generator used for all draws.
+    dtype : dtype both returned objects are cast to.
+    binarize : if True, the target becomes 1 where it exceeds its own mean, else 0.
+
+    Returns
+    -------
+    (features DataFrame with columns X1..X20, target Series), both cast to ``dtype``.
+    """
+
+    distr = getattr(stats, distr_name)
+
+    # Private generator: a RandomState seeded this way produces the same stream the legacy
+    # np.random.seed(random_state) would, but the process-wide RNG state is left untouched.
+    rng = np.random.RandomState(random_state)
+
+    def draw() -> np.ndarray:
+        # scipy's rvs expects shape parameters first; loc/scale go by keyword so that a
+        # non-empty distr_params is not shifted into the wrong slots.
+        return distr.rvs(*distr_params, loc=loc, scale=scale, size=N, random_state=rng)
+
+    df = pd.DataFrame({f"X{i + 1}": draw() for i in range(20)})
+
+    # NOTE: keep this call directly in this function's frame - `@I` is resolved from the caller's scope.
+    target = df.eval(
+        "X1+sin(X2)+log(abs(X3))+X4**2+X5*X6 +@I((X7*X8*X9)<0)+@I(X10>0)+X11*@I(X11>0)+sqrt(abs(X12)) +cos(X13)+2*X14+abs(X15)+@I(X16<-1)+X17*@I(X17<-1)-2*X18-X19*X20"
+    )
+
+    if add_error:
+        target += draw()
+    if binarize:
+        target = (target > target.mean()).astype(np.int8)
+    return df.astype(dtype), target.astype(dtype)
+
+
+def showcase_pycaret_datasets():
+    """Return the 20 rows with the largest "# Instances" from the table given by pycaret's ``get_data(verbose=False)``.
+
+    The column is cast to int32 first; rows come back in ascending order of that column
+    with a fresh 0..n-1 index. (Presumably the table is pycaret's dataset catalogue - confirm.)
+    """
+
+    from pycaret.datasets import get_data
+
+    catalogue = get_data(verbose=False)
+    catalogue["# Instances"] = catalogue["# Instances"].astype(np.int32)
+    largest = catalogue.sort_values("# Instances").tail(20)
+    return largest.reset_index(drop=True)
diff --git a/mlframe/early_stopping.py → early_stopping.py b/mlframe/early_stopping.py → early_stopping.py
diff --git a/mlframe/eda.py → eda.py b/mlframe/eda.py → eda.py
diff --git a/mlframe/ensembling.py → ensembling.py b/mlframe/ensembling.py → ensembling.py
diff --git a/mlframe/estimators.py → estimators.py b/mlframe/estimators.py → estimators.py
diff --git a/mlframe/evaluation.py → evaluation.py b/mlframe/evaluation.py → evaluation.py
diff --git a/mlframe/ewma.py → ewma.py b/mlframe/ewma.py → ewma.py
diff --git a/mlframe/experiments.py → experiments.py b/mlframe/experiments.py → experiments.py
diff --git a/mlframe/explainability.py → explainability.py b/mlframe/explainability.py → explainability.py
diff --git a/mlframe/feature_cleaning.py → feature_cleaning.py b/mlframe/feature_cleaning.py → feature_cleaning.py
diff --git a/mlframe/feature_engineering/__init__.py → feature_engineering/__init__.py b/mlframe/feature_engineering/__init__.py → feature_engineering/__init__.py
diff --git a/mlframe/feature_engineering/basic.py → feature_engineering/basic.py b/mlframe/feature_engineering/basic.py → feature_engineering/basic.py
diff --git a/mlframe/feature_engineering/categorical.py → feature_engineering/categorical.py b/mlframe/feature_engineering/categorical.py → feature_engineering/categorical.py
diff --git a/mlframe/feature_engineering/hurst.py → feature_engineering/hurst.py b/mlframe/feature_engineering/hurst.py → feature_engineering/hurst.py
diff --git a/mlframe/feature_engineering/numerical.py → feature_engineering/numerical.py b/mlframe/feature_engineering/numerical.py → feature_engineering/numerical.py
@@ -973,7 +973,7 @@ def get_numaggs_names(
         )
         + ([] if (directional_only or not return_unsorted_stats) else "nuniques,modmin,modmax,modmean,modqty".split(","))
         + ([] if directional_only else (["q" + str(q) for q in q]))
-        + ([] if not return_unsorted_stats else ["ncrs" + str(q) for q in q])
+        + ([] if (directional_only or not return_unsorted_stats) else ["ncrs" + str(q) for q in q])
         + get_moments_slope_mi_feature_names(weights=weights, directional_only=directional_only, return_lintrend_approx_stats=return_lintrend_approx_stats)
         # + ["mutual_info_regression",]
         + (["hursth", "hurstc"] if return_hurst else [])

diff --git a/mlframe/feature_engineering/timeseries.py → feature_engineering/timeseries.py b/mlframe/feature_engineering/timeseries.py → feature_engineering/timeseries.py
diff --git a/mlframe/feature_importance.py → feature_importance.py b/mlframe/feature_importance.py → feature_importance.py
diff --git a/mlframe/feature_selection/__init__.py → feature_selection/__init__.py b/mlframe/feature_selection/__init__.py → feature_selection/__init__.py
diff --git a/mlframe/feature_selection/filters.py → feature_selection/filters.py b/mlframe/feature_selection/filters.py → feature_selection/filters.py
@@ -2465,7 +2465,7 @@ def discretize_array(
     return quantize_search(arr, bins_edges).astype(dtype)  # njitted
 
 
-@njit(parallel=True)
+#@njit(parallel=True)
 def discretize_2d_array(
     arr: np.ndarray,
     n_bins: int = 10,
@@ -2479,7 +2479,7 @@ def discretize_2d_array(
 
     res = np.empty_like(arr, dtype=dtype)
 
-    for col in prange(arr.shape[1]):
+    for col in tqdmu(arr.shape[1],desc='col',leave=False): # prange
         res[:, col] = discretize_array(
             arr=arr[:, col],
             n_bins=n_bins,

diff --git a/mlframe/feature_selection/wrappers.py → feature_selection/wrappers.py b/mlframe/feature_selection/wrappers.py → feature_selection/wrappers.py
diff --git a/mlframe/helpers.py → helpers.py b/mlframe/helpers.py → helpers.py
diff --git a/mlframe/inference.py → inference.py b/mlframe/inference.py → inference.py
diff --git a/mlframe/keras.py → keras.py b/mlframe/keras.py → keras.py
diff --git a/mlframe/lightninglib.py → lightninglib.py b/mlframe/lightninglib.py → lightninglib.py
@@ -145,36 +145,6 @@ def predict_proba(self, X):
         return self.predict(X)
 
 
-class IdentityEstimator(BaseEstimator):
-    """Just returns the 1st feature as-is instead of real learning & predicting."""
-
-    def fit(self, X, y, **fit_params):
-        if isinstance(self, ClassifierMixin):
-            if isinstance(y, pd.Series):
-                self.classes_ = sorted(y.unique())
-            else:
-                self.classes_ = sorted(np.unique(y))
-        return self
-
-    def predict(self, X):
-        if isinstance(X, (pd.DataFrame, pd.Series)):
-            X = X.to_numpy()
-        return X[:, 0]
-
-
-class IdentityRegressor(IdentityEstimator, RegressorMixin):
-    pass
-
-
-class IdentityClassifier(IdentityEstimator, ClassifierMixin):
-    def predict_proba(self, X):
-        last_class_probs = self.predict(X)
-        if len(self.classes_) == 2:
-            return np.vstack([1 - last_class_probs, last_class_probs]).T
-        else:
-            return last_class_probs
-
-
 # ----------------------------------------------------------------------------------------------------------------------------
 # Data
 # ----------------------------------------------------------------------------------------------------------------------------

diff --git a/mlframe/metrics.py → metrics.py b/mlframe/metrics.py → metrics.py
@@ -554,11 +554,7 @@ def integral_calibration_error_from_metrics(
     ICE is a weighted sum of baseline losses-"roc_auc goodness over 0.5".
     If roc_auc is not good enough, it incurs additional penalty.
     """
-    res = (
-        brier_loss * brier_loss_weight
-        + (calibration_mae * mae_weight + calibration_std * std_weight) * np.abs(roc_auc - 0.5)
-        - np.abs(roc_auc - 0.5) * roc_auc_weight
-    )
+    res = brier_loss * brier_loss_weight + calibration_mae * mae_weight + calibration_std * std_weight - np.abs(roc_auc - 0.5) * roc_auc_weight
     if np.abs(roc_auc - 0.5) < (min_roc_auc - 0.5):
         res += roc_auc_penalty
     return res
@@ -676,11 +672,13 @@ def create_robustness_subgroups(
     return subgroups
 
 
-def create_robustness_subgroups_indices(subgroups: dict, train_idx: np.ndarray, val_idx: np.ndarray, test_idx: np.ndarray, group_weights: dict = {}, cont_nbins: int = 3) -> dict:
+def create_robustness_subgroups_indices(
+    subgroups: dict, train_idx: np.ndarray, val_idx: np.ndarray, test_idx: np.ndarray, group_weights: dict = {}, cont_nbins: int = 3
+) -> dict:
     res = {}
-    if len(val_idx)==len(test_idx):
+    if len(val_idx) == len(test_idx):
         logger.warning(f"Validation and test sets have the same size. Robustness subgroups estimation will be incorrect.")
-    for arr in (train_idx, test_idx,val_idx):
+    for arr in (train_idx, test_idx, val_idx):
         npoints = len(arr)
         robustness_subgroups_indices = {}
         for group_name, group_params in subgroups.items():
@@ -841,6 +839,7 @@ def robust_mlperf_metric(
     higher_is_better: bool,
     subgroups: dict = None,
     whole_set_weight: float = 0.5,
+    min_group_size: int = 100,
 ) -> float:
     """Bins idices need to be aware of arr sizes: boostings can call the metric on
     multiple sets of differnt lengths - train, val, etc. Arrays will be pure numpy, so no other means to
@@ -859,6 +858,8 @@ def robust_mlperf_metric(
 
             perfs = []
             for bin_name, bin_indices in bins.items():
+                if len(bin_indices) < min_group_size:
+                    continue
                 if isinstance(y_score, Sequence):
                     if len(y_score) == 2:
                         metric_value = metric(y_true[bin_indices], [el[bin_indices] for el in y_score])
@@ -871,14 +872,15 @@ def robust_mlperf_metric(
                         metric_value = metric(y_true[bin_indices], y_score[bin_indices])
                 perfs.append(metric_value)
 
-            perfs = np.array(perfs)
-            bin_metric_value = perfs.mean()
-            if higher_is_better:
-                bin_metric_value -= perfs.std()
-            else:
-                bin_metric_value += perfs.std()
+            if perfs:
+                perfs = np.array(perfs)
+                bin_metric_value = perfs.mean()
+                if higher_is_better:
+                    bin_metric_value -= perfs.std()
+                else:
+                    bin_metric_value += perfs.std()
 
-            weights_sum += bin_weight
-            total_metric_value += bin_metric_value * bin_weight
+                weights_sum += bin_weight
+                total_metric_value += bin_metric_value * bin_weight
 
     return total_metric_value / weights_sum
diff --git a/mlframe/mlflowlib.py → mlflowlib.py b/mlframe/mlflowlib.py → mlflowlib.py
diff --git a/mlframe/model_selection.py → model_selection.py b/mlframe/model_selection.py → model_selection.py
diff --git a/mlframe/optimization.py → optimization.py b/mlframe/optimization.py → optimization.py
@@ -521,6 +521,7 @@ def submit_evaluations(self, candidates: Sequence, evaluations: Sequence, durati
                     y_label=self.y_label,
                     expected_fitness_color=self.expected_fitness_color,
                     legend_location=self.legend_location,
+                    skip_candidates=[0],
                 )
 
 
@@ -719,6 +720,7 @@ def plot_search_state(
     ground_truth: np.ndarray,
     known_candidates: np.ndarray,
     known_evaluations: np.ndarray,
+    skip_candidates: Sequence,
     acquisition_method: str,
     mode: str,
     additional_info: str,
@@ -752,8 +754,14 @@ def plot_search_state(
     if y_pred is not None:
         axMain.plot(search_space, y_pred, color="red", linestyle="dashed", label="Surrogate Function")
         axMain.fill_between(search_space, y_pred - y_std, y_pred + y_std, color="blue", alpha=0.2)
+
     axMain.scatter(known_candidates, known_evaluations, color="blue", label="Known Points")
 
+    if skip_candidates:
+        idx = ~np.isin(known_candidates, skip_candidates)
+        if idx.sum() > 0:
+            axMain.set_ylim([known_evaluations[idx].min(), None])
+
     axExpectedFitness.set_yticklabels([])
     axExpectedFitness.set_yticks([])
     axExpectedFitness.set_ylabel(acquisition_method, color=expected_fitness_color)

diff --git a/mlframe/outliers.py → outliers.py b/mlframe/outliers.py → outliers.py
diff --git a/mlframe/pipelines.py → pipelines.py b/mlframe/pipelines.py → pipelines.py
diff --git a/mlframe/preprocessing.py → preprocessing.py b/mlframe/preprocessing.py → preprocessing.py
diff --git a/probabilities.py b/probabilities.py
@@ -0,0 +1,51 @@
+import numpy as np
+from numba import njit
+
+
+@njit()
+def generate_probs_from_outcomes(
+    outcomes: np.ndarray, chunk_size: int = 20, scale: float = 0.1, nbins: int = 10, bins_std: float = 0.1, flip_percent: float = 0.6
+) -> np.ndarray:
+    """Can we generate hypothetical ground truth probs knowing the outcomes in advance?
+    Our model probs will (hopefully) be calibrated. So, we need synthetic probs to be calibrated, too. With some degree of fitness.
+    We also need to cover a broad range of probs.
+    So, how to achieve this?
+
+    0) if flip_percent is specified, zeroes and ones are flipped at int(n * flip_percent) randomly drawn positions
+       (drawn with replacement, so fewer distinct positions may actually flip). This will lower ROC AUC.
+       The flips are made on a copy; the caller's array is not modified.
+    1) the (possibly flipped) outcomes are traversed in consecutive chunks of chunk_size observations;
+       the last chunk is shorter when len(outcomes) is not a multiple of chunk_size.
+    2) for every chunk, its real event frequency is computed.
+    3) the frequency is shifted by a pregenerated offset of the frequency bin it falls into (nbins bins, offsets scaled by bins_std),
+       the same for all observations of the chunk.
+    4) for every observation, uniform symmetric noise is added, scaled by `scale` and by the distance of the frequency from 0.5.
+
+    The final result is clipped to [0,1] and returned as a float32 array of the same length as outcomes.
+    """
+    n = len(outcomes)
+    indices = np.arange(n)
+    np.random.shuffle(indices)  # only serves as the pool from which flip positions are drawn
+
+    probs = np.empty(n, dtype=np.float32)
+    bin_offsets = (np.random.random(size=nbins) - 0.5) * bins_std
+
+    if flip_percent:
+        # flip some bits to worsen our so far perfect predictive power
+        flip_size = int(n * flip_percent)
+        if flip_size:
+            outcomes = outcomes.copy()
+            flip_indices = np.random.choice(indices, size=flip_size)
+            outcomes[flip_indices] = 1 - outcomes[flip_indices]
+
+    # Step by chunk_size up to n so that a shorter tail chunk is also filled:
+    # probs comes from np.empty, anything left unwritten would be returned as garbage.
+    for left in range(0, n, chunk_size):
+        right = min(left + chunk_size, n)
+        freq = outcomes[left:right].mean()  # real event frequency in the current chunk of observations
+
+        # add pregenerated offset for the particular bin;
+        # freq == 1.0 would map to index nbins (one past the end), so it is folded into the last bin.
+        bin_idx = min(int(freq * nbins), nbins - 1)
+        freq = freq + bin_offsets[bin_idx]
+
+        # add small symmetric random noise. it must be higher when freq approaches [0;1] borders.
+        probs[left:right] = freq + (np.random.random(size=right - left) - 0.5) * scale * np.abs(freq - 0.5)
+
+    return np.clip(probs, 0.0, 1.0)
diff --git a/mlframe/public_suffix_list.dat → public_suffix_list.dat b/mlframe/public_suffix_list.dat → public_suffix_list.dat
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/mlframe/scalers.py → scalers.py b/mlframe/scalers.py → scalers.py
diff --git a/mlframe/stats.py → stats.py b/mlframe/stats.py → stats.py
diff --git a/mlframe/synthetic.py → synthetic.py b/mlframe/synthetic.py → synthetic.py
diff --git a/mlframe/tests.py → tests.py b/mlframe/tests.py → tests.py
diff --git a/mlframe/text.py → text.py b/mlframe/text.py → text.py