From 6e6aab5cf40784731b3267aef6f991ebbb90597c Mon Sep 17 00:00:00 2001
From: fingoldo
Date: Wed, 30 Aug 2023 13:08:39 +0300
Subject: [PATCH]

---
 Helpers.py                        | 26 +++++++++++---
 calibration.py                    |  2 +-
 custom_estimators.py              |  2 +-
 eda.py                            |  2 +-
 ewma.py                           |  2 +-
 feature_cleaning.py               |  2 +-
 feature_engineering/basic.py      |  2 +-
 feature_engineering/numerical.py  |  2 +-
 feature_engineering/timeseries.py |  2 +-
 metrics.py                        | 59 ++++++++++++++++++++++++++++++-
 preprocessing.py                  | 16 ++++----
 stats.py                          |  2 +-
 tuning.py                         |  2 +-
 13 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/Helpers.py b/Helpers.py
index c5d2ca6..c80eb63 100644
--- a/Helpers.py
+++ b/Helpers.py
@@ -16,8 +16,10 @@
 # -----------------------------------------------------------------------------------------------------------------------------------------------------
 from typing import *  # noqa: F401 pylint: disable=wildcard-import,unused-wildcard-import
+from enum import Enum
 
 import mlflow
+
 
 ########################################################################################################################################################################################################
 # Helper functions
 ########################################################################################################################################################################################################
@@ -115,19 +117,29 @@ def embed_website_to_mlflow(url:str,fname:str="url",extension:str='.html',width:
     with open(fname+extension, "w") as f:
         f.write(website_embed)
 
-def get_or_create_mlflow_run(run_name: str, parent_run_id: str = None, experiment_name: str = None, experiment_id: str = None) -> Tuple[object, bool]:
+def get_or_create_mlflow_run(run_name: str, parent_run_id: str = None, experiment_name: str = None, experiment_id: str = None, tags: dict = {}) -> Tuple[object, bool]:
     """Tries to find a run by name within current mlflow experiment. If not found, creates new one.
 
     """
-    runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f'run_name = "{run_name}"', output_format="list")
+    filter_string = f'run_name = "{run_name}"'
+    if parent_run_id:
+        filter_string += f' and tag.mlflow.parentRunId = "{parent_run_id}"'
+
+    runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=filter_string, output_format="list")
     if runs:
         for run in runs:
             return run, True
     else:
         if experiment_name:
             mlflow.set_experiment(experiment_name=experiment_name)
+        run_tags = {"mlflow.parentRunId": parent_run_id} if parent_run_id else None
+        if tags:
+            if run_tags is None:
+                run_tags = tags
+            else:
+                run_tags.update(tags)
         run = mlflow.start_run(
-            run_name=run_name, experiment_id=experiment_id, tags={"mlflow.parentRunId": parent_run_id} if parent_run_id else None
+            run_name=run_name, experiment_id=experiment_id, tags=run_tags
         )  # parent_run.info.run_id
         mlflow.end_run()
         return run, False
@@ -136,7 +148,13 @@ def create_mlflow_run_label(params: dict, category: str = None) -> str:
     label = []
     for key, value in params.items():
         if value:
-            label.append(f"{key}={value}")
+            if isinstance(value, Enum):
+                label.append(f"{key}={value.name}")
+            else:
+                if type(value) == type:
+                    label.append(f"{key}={value.__name__}")
+                else:
+                    label.append(f"{key}={value}")
     label = ",".join(label)
     if category:
         label = f"{category}:{label}"
diff --git a/calibration.py b/calibration.py
index 869c254..3ec6826 100644
--- a/calibration.py
+++ b/calibration.py
@@ -18,7 +18,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("pandas numpy properscoring scikit-learn")
+# ensure_installed("pandas numpy properscoring")  # scikit-learn
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/custom_estimators.py b/custom_estimators.py
index b500846..02da50b 100644
--- a/custom_estimators.py
+++ b/custom_estimators.py
@@ -14,7 +14,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("numpy pandas scikit-learn")
+# ensure_installed("numpy pandas")  # scikit-learn
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/eda.py b/eda.py
index 64b6426..94b740b 100644
--- a/eda.py
+++ b/eda.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("pandas")
+# ensure_installed("pandas")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/ewma.py b/ewma.py
index 49b6bcc..4977453 100644
--- a/ewma.py
+++ b/ewma.py
@@ -14,7 +14,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("numpy")
+# ensure_installed("numpy")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_cleaning.py b/feature_cleaning.py
index 54946c0..0205fde 100644
--- a/feature_cleaning.py
+++ b/feature_cleaning.py
@@ -24,7 +24,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy pandas psutil")
+# ensure_installed("numpy pandas psutil")
 
 # -----------------------------------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_engineering/basic.py b/feature_engineering/basic.py
index 665870c..e3bb05a 100644
--- a/feature_engineering/basic.py
+++ b/feature_engineering/basic.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy pandas")
+# ensure_installed("numpy pandas")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_engineering/numerical.py b/feature_engineering/numerical.py
index 2abb17a..92671ef 100644
--- a/feature_engineering/numerical.py
+++ b/feature_engineering/numerical.py
@@ -18,7 +18,7 @@
     ensure_installed,
 )  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy numba sklearn antropy entropy_estimators")  # npeet?
+# ensure_installed("numpy numba sklearn antropy entropy_estimators")  # npeet?
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_engineering/timeseries.py b/feature_engineering/timeseries.py
index f710f99..79d982d 100644
--- a/feature_engineering/timeseries.py
+++ b/feature_engineering/timeseries.py
@@ -12,7 +12,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("numpy pandas PyWavelets")
+# ensure_installed("numpy pandas")  # PyWavelets
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/metrics.py b/metrics.py
index 0ad4b5e..92664f1 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,8 +1,16 @@
-import numpy as np, pandas as pd
+# ----------------------------------------------------------------------------------------------------------------------------
+# Normal Imports
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from typing import *
 from numba import njit
 from math import floor
+import numpy as np, pandas as pd
 from matplotlib import pyplot as plt
 
+# ----------------------------------------------------------------------------------------------------------------------------
+# Core
+# ----------------------------------------------------------------------------------------------------------------------------
 
 def fast_auc(y_true: np.array, y_score: np.array) -> float:
     """np.argsort needs to stay out of njitted func."""
@@ -183,3 +191,52 @@ def predictions_time_instability(preds: pd.Series) -> float:
     For binary classification instability ranges from 0 to 1, for regression from 0 to any value depending on the target stats.
     """
     return np.abs(np.diff(preds)).mean()
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Errors & scorers
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+class CB_CALIB_ERROR:
+    def is_max_optimal(self):
+        return False  # lower is better: this is an error
+
+    def evaluate(self, approxes, target, weight):
+        output_weight = 1  # weight is not used
+
+        # predictions = expit(approxes[0])
+        predictions = 1 / (1 + np.exp(-approxes[0]))
+
+        calibration_mae, calibration_std = fast_calibration_metrics(y_true=target, y_pred=predictions)
+        return calibration_mae + calibration_std / 10, output_weight
+
+    def get_final_error(self, error, weight):
+        return error
+
+
+class CB_PRECISION:
+    def is_max_optimal(self):
+        return True  # greater is better
+
+    def evaluate(self, approxes, target, weight):
+        output_weight = 1  # weight is not used
+
+        # predictions = expit(approxes[0])
+        predictions = 1 / (1 + np.exp(-approxes[0]))
+
+        return fast_precision(y_true=target, y_pred=(predictions >= 0.5).astype(np.int8), zero_division=0), output_weight
+
+    def get_final_error(self, error, weight):
+        return error
+
+
+def calib_error(labels: np.ndarray, predt: np.ndarray) -> float:
+    """Calibration error."""
+
+    calibration_mae, calibration_std = fast_calibration_metrics(y_true=labels, y_pred=predt)
+    return calibration_mae + calibration_std / 10
+
+
+def calib_error_keras(labels: np.ndarray, predt: np.ndarray) -> float:
+    return calib_error(labels=labels.numpy()[:, -1], predt=predt.numpy()[:, -1])
\ No newline at end of file
diff --git a/preprocessing.py b/preprocessing.py
index e8be4b4..02e09a7 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -8,16 +8,17 @@
 from pyutilz.system import tqdmu
 
 
-def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_features: Sequence = [], cat_features: list = [], na_filler: str = "") -> None:
+def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_features: Sequence = [], cat_features: list = [], na_filler: str = "", ensure_categorical: bool = True, verbose: bool = False) -> None:
     """
-    Catboost needs NAs replaced by a string value.
+    CatBoost needs NAs in cat features replaced by a string value.
     Possibly extends cat_features list.
+    ensure_categorical=True makes further processing also suitable for xgboost.
     """
     cols = set(df.columns)
 
     for var in tqdmu(text_features, desc="Processing textual features for CatBoost...", leave=False):
-        if var in cols:
-            if var not in columns_to_drop:
+        if var in cols and var not in columns_to_drop:
+            if df[var].isna().any():
                 df[var] = df[var].fillna(na_filler)
 
     for var in tqdmu(cols, desc="Processing categorical features for CatBoost...", leave=False):
@@ -25,14 +26,13 @@ def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_fea
             if df[var].isna().any():
                 df[var] = df[var].astype(str).fillna(na_filler).astype('category')
             if var not in cat_features:
-                logging.info(f"{var} appended to cat_features")
-                #df[var] = df[var].astype(str) #(?)
+                if verbose: logging.info(f"{var} appended to cat_features")
                 cat_features.append(var)
         else:
             if var in cat_features:
                 if df[var].isna().any():
                     df[var] = df[var].fillna(na_filler)
-                df[var] = df[var].astype('category')
+                if ensure_categorical: df[var] = df[var].astype('category')
 
 
-def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ) -> None:
+def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ensure_categorical: bool = True) -> None:
@@ -47,5 +47,5 @@ def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ) -> None:
             #df[var] = df[var].astype(str) #(?)
             cat_features.append(var)
         else:
-            if var in cat_features:
+            if var in cat_features and ensure_categorical:
                 df[var] = df[var].astype('category')
\ No newline at end of file
diff --git a/stats.py b/stats.py
index c7bfa01..b657a7a 100644
--- a/stats.py
+++ b/stats.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy scipy")
+# ensure_installed("numpy scipy")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/tuning.py b/tuning.py
index c26832c..85b4e8f 100644
--- a/tuning.py
+++ b/tuning.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("pandas numpy scipy")
+# ensure_installed("pandas numpy scipy")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
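
Usage sketch for the Helpers.py changes above (not part of the patch): the experiment name, run names, tag values and the Imputation enum below are illustrative, and the sketch assumes Helpers.py is importable and an MLflow tracking URI is configured.

    from enum import Enum

    from Helpers import create_mlflow_run_label, get_or_create_mlflow_run

    parent_run, existed = get_or_create_mlflow_run(run_name="training", experiment_name="demo-experiment")

    # Child runs are now looked up (and created) via the mlflow.parentRunId tag,
    # and any extra tags are merged into the new run's tag set.
    child_run, existed = get_or_create_mlflow_run(
        run_name="fold-0",
        experiment_name="demo-experiment",
        parent_run_id=parent_run.info.run_id,
        tags={"stage": "cv"},
    )

    class Imputation(Enum):  # hypothetical enum, only to show the new Enum/class label handling
        MEAN = 1

    # Enum members are rendered by name, classes by __name__: "prep:imputation=MEAN,estimator=dict"
    label = create_mlflow_run_label({"imputation": Imputation.MEAN, "estimator": dict}, category="prep")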
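Usage sketch for the new metrics.py scorers together with the extended prepare_df_for_catboost (not part of the patch): CB_CALIB_ERROR follows CatBoost's user-defined metric protocol (is_max_optimal / evaluate / get_final_error) and can be passed as eval_metric. The tiny frame is a placeholder, and the sketch assumes fast_calibration_metrics already exists in metrics.py, as the new classes require.

    import pandas as pd
    from catboost import CatBoostClassifier

    from metrics import CB_CALIB_ERROR
    from preprocessing import prepare_df_for_catboost

    df = pd.DataFrame({"city": ["a", None, "b", "a"], "amount": [1.0, 2.0, 3.0, 4.0]})
    target = [0, 1, 0, 1]

    cat_features: list = []  # extended in place by the helper
    prepare_df_for_catboost(df, text_features=[], cat_features=cat_features, verbose=True)

    # A user-defined metric object is evaluated on the eval set at every iteration.
    model = CatBoostClassifier(iterations=50, eval_metric=CB_CALIB_ERROR(), verbose=False)
    model.fit(df, target, cat_features=cat_features, eval_set=(df, target))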
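One plausible consumer of the plain calib_error function (an assumption, not stated in the patch): recent xgboost versions accept a callable with a (y_true, y_pred) -> float signature as eval_metric in the scikit-learn interface, which matches calib_error. The synthetic data is illustrative.

    import numpy as np
    from xgboost import XGBClassifier

    from metrics import calib_error

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = (X[:, 0] + rng.normal(size=200) > 0).astype(int)

    # calib_error returns an error, so smaller values are better.
    model = XGBClassifier(n_estimators=50, eval_metric=calib_error)
    model.fit(X[:150], y[:150], eval_set=[(X[150:], y[150:])], verbose=False)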