From 6e6aab5cf40784731b3267aef6f991ebbb90597c Mon Sep 17 00:00:00 2001
From: fingoldo
Date: Wed, 30 Aug 2023 13:08:39 +0300
Subject: [PATCH]

---
 Helpers.py                        | 26 +++++++++++---
 calibration.py                    |  2 +-
 custom_estimators.py              |  2 +-
 eda.py                            |  2 +-
 ewma.py                           |  2 +-
 feature_cleaning.py               |  2 +-
 feature_engineering/basic.py      |  2 +-
 feature_engineering/numerical.py  |  2 +-
 feature_engineering/timeseries.py |  2 +-
 metrics.py                        | 59 ++++++++++++++++++++++++++++++-
 preprocessing.py                  | 16 ++++----
 stats.py                          |  2 +-
 tuning.py                         |  2 +-
 13 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/Helpers.py b/Helpers.py
index c5d2ca6..c80eb63 100644
--- a/Helpers.py
+++ b/Helpers.py
@@ -16,8 +16,10 @@
 # -----------------------------------------------------------------------------------------------------------------------------------------------------
 from typing import *  # noqa: F401 pylint: disable=wildcard-import,unused-wildcard-import
+from enum import Enum
 
 import mlflow
+
 
 ########################################################################################################################################################################################################
 # Helper functions
 ########################################################################################################################################################################################################
@@ -115,19 +117,29 @@ def embed_website_to_mlflow(url:str,fname:str="url",extension:str='.html',width:
     with open(fname+extension, "w") as f:
         f.write(website_embed)
 
-def get_or_create_mlflow_run(run_name: str, parent_run_id: str = None, experiment_name: str = None, experiment_id: str = None) -> Tuple[object, bool]:
+def get_or_create_mlflow_run(run_name: str, parent_run_id: str = None, experiment_name: str = None, experiment_id: str = None, tags: dict = {}) -> Tuple[object, bool]:
     """Tries to find a run by name within current mlflow experiment. If not found, creates new one.
 
     """
-    runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f'run_name = "{run_name}"', output_format="list")
+    filter_string = f'run_name = "{run_name}"'
+    if parent_run_id:
+        filter_string += f' and tag.mlflow.parentRunId = "{parent_run_id}"'
+
+    runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=filter_string, output_format="list")
     if runs:
         for run in runs:
             return run, True
     else:
         if experiment_name:
             mlflow.set_experiment(experiment_name=experiment_name)
+        run_tags = {"mlflow.parentRunId": parent_run_id} if parent_run_id else None
+        if tags:
+            if run_tags is None:
+                run_tags = tags
+            else:
+                run_tags.update(tags)
         run = mlflow.start_run(
-            run_name=run_name, experiment_id=experiment_id, tags={"mlflow.parentRunId": parent_run_id} if parent_run_id else None
+            run_name=run_name, experiment_id=experiment_id, tags=run_tags
         )  # parent_run.info.run_id
         mlflow.end_run()
         return run, False
@@ -136,7 +148,13 @@ def create_mlflow_run_label(params: dict, category: str = None) -> str:
     label = []
     for key, value in params.items():
         if value:
-            label.append(f"{key}={value}")
+            if isinstance(value, Enum):
+                label.append(f"{key}={value.name}")
+            else:
+                if type(value) == type:
+                    label.append(f"{key}={value.__name__}")
+                else:
+                    label.append(f"{key}={value}")
     label = ",".join(label)
     if category:
         label = f"{category}:{label}"
diff --git a/calibration.py b/calibration.py
index 869c254..3ec6826 100644
--- a/calibration.py
+++ b/calibration.py
@@ -18,7 +18,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("pandas numpy properscoring scikit-learn")
+# ensure_installed("pandas numpy properscoring")  # scikit-learn
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/custom_estimators.py b/custom_estimators.py
index b500846..02da50b 100644
--- a/custom_estimators.py
+++ b/custom_estimators.py
@@ -14,7 +14,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("numpy pandas scikit-learn")
+# ensure_installed("numpy pandas")  # scikit-learn
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/eda.py b/eda.py
index 64b6426..94b740b 100644
--- a/eda.py
+++ b/eda.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("pandas")
+# ensure_installed("pandas")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/ewma.py b/ewma.py
index 49b6bcc..4977453 100644
--- a/ewma.py
+++ b/ewma.py
@@ -14,7 +14,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("numpy")
+# ensure_installed("numpy")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_cleaning.py b/feature_cleaning.py
index 54946c0..0205fde 100644
--- a/feature_cleaning.py
+++ b/feature_cleaning.py
@@ -24,7 +24,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy pandas psutil")
+# ensure_installed("numpy pandas psutil")
 
 # -----------------------------------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_engineering/basic.py b/feature_engineering/basic.py
index 665870c..e3bb05a 100644
--- a/feature_engineering/basic.py
+++ b/feature_engineering/basic.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy pandas")
+# ensure_installed("numpy pandas")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_engineering/numerical.py b/feature_engineering/numerical.py
index 2abb17a..92671ef 100644
--- a/feature_engineering/numerical.py
+++ b/feature_engineering/numerical.py
@@ -18,7 +18,7 @@
     ensure_installed,
 )  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy numba sklearn antropy entropy_estimators")  # npeet?
+# ensure_installed("numpy numba sklearn antropy entropy_estimators")  # npeet?
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/feature_engineering/timeseries.py b/feature_engineering/timeseries.py
index f710f99..79d982d 100644
--- a/feature_engineering/timeseries.py
+++ b/feature_engineering/timeseries.py
@@ -12,7 +12,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("numpy pandas PyWavelets")
+# ensure_installed("numpy pandas")  # PyWavelets
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/metrics.py b/metrics.py
index 0ad4b5e..92664f1 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,8 +1,16 @@
-import numpy as np, pandas as pd
+# ----------------------------------------------------------------------------------------------------------------------------
+# Normal Imports
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from typing import *
 from numba import njit
 from math import floor
+import numpy as np, pandas as pd
 from matplotlib import pyplot as plt
 
+# ----------------------------------------------------------------------------------------------------------------------------
+# Core
+# ----------------------------------------------------------------------------------------------------------------------------
 
 def fast_auc(y_true: np.array, y_score: np.array) -> float:
     """np.argsort needs to stay out of njitted func."""
@@ -183,3 +191,52 @@ def predictions_time_instability(preds: pd.Series) -> float:
     For binary classification instability ranges from 0 to 1, for regression from 0 to any value depending on the target stats.
     """
     return np.abs(np.diff(preds)).mean()
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Errors & scorers
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+class CB_CALIB_ERROR:
+    def is_max_optimal(self):
+        return False  # lower is better: this is an error
+
+    def evaluate(self, approxes, target, weight):
+        output_weight = 1  # weight is not used
+
+        # predictions = expit(approxes[0])
+        predictions = 1 / (1 + np.exp(-approxes[0]))
+
+        calibration_mae, calibration_std = fast_calibration_metrics(y_true=target, y_pred=predictions)
+        return calibration_mae + calibration_std / 10, output_weight
+
+    def get_final_error(self, error, weight):
+        return error
+
+
+class CB_PRECISION:
+    def is_max_optimal(self):
+        return True  # greater is better
+
+    def evaluate(self, approxes, target, weight):
+        output_weight = 1  # weight is not used
+
+        # predictions = expit(approxes[0])
+        predictions = 1 / (1 + np.exp(-approxes[0]))
+
+        return fast_precision(y_true=target, y_pred=(predictions >= 0.5).astype(np.int8), zero_division=0), output_weight
+
+    def get_final_error(self, error, weight):
+        return error
+
+
+def calib_error(labels: np.ndarray, predt: np.ndarray) -> float:
+    """Calibration error."""
+
+    calibration_mae, calibration_std = fast_calibration_metrics(y_true=labels, y_pred=predt)
+    return calibration_mae + calibration_std / 10
+
+
+def calib_error_keras(labels: np.ndarray, predt: np.ndarray) -> float:
+    return calib_error(labels=labels.numpy()[:, -1], predt=predt.numpy()[:, -1])
\ No newline at end of file
diff --git a/preprocessing.py b/preprocessing.py
index e8be4b4..02e09a7 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -8,16 +8,17 @@
 from pyutilz.system import tqdmu
 
 
-def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_features: Sequence = [], cat_features: list = [], na_filler: str = "") -> None:
+def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_features: Sequence = [], cat_features: list = [], na_filler: str = "", ensure_categorical: bool = True, verbose: bool = False) -> None:
     """
-    Catboost needs NAs replaced by a string value.
+    CatBoost needs NAs in cat features replaced by a string value.
     Possibly extends cat_features list.
+    ensure_categorical=True makes further processing also suitable for xgboost.
     """
     cols = set(df.columns)
 
     for var in tqdmu(text_features, desc="Processing textual features for CatBoost...", leave=False):
-        if var in cols:
-            if var not in columns_to_drop:
+        if var in cols and var not in columns_to_drop:
+            if df[var].isna().any():
                 df[var] = df[var].fillna(na_filler)
 
     for var in tqdmu(cols, desc="Processing categorical features for CatBoost...", leave=False):
@@ -25,14 +26,13 @@ def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_fea
             if df[var].isna().any():
                 df[var] = df[var].astype(str).fillna(na_filler).astype('category')
             if var not in cat_features:
-                logging.info(f"{var} appended to cat_features")
-                #df[var] = df[var].astype(str) #(?)
+                if verbose: logging.info(f"{var} appended to cat_features")
                 cat_features.append(var)
         else:
             if var in cat_features:
                 if df[var].isna().any():
                     df[var] = df[var].fillna(na_filler)
-                df[var] = df[var].astype('category')
+                if ensure_categorical: df[var] = df[var].astype('category')
 
 
-def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ) -> None:
+def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ensure_categorical: bool = True) -> None:
@@ -47,5 +47,5 @@ def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ) -> None:
             #df[var] = df[var].astype(str) #(?)
             cat_features.append(var)
         else:
-            if var in cat_features:
+            if var in cat_features and ensure_categorical:
                 df[var] = df[var].astype('category')
\ No newline at end of file
diff --git a/stats.py b/stats.py
index c7bfa01..b657a7a 100644
--- a/stats.py
+++ b/stats.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
 
-ensure_installed("numpy scipy")
+# ensure_installed("numpy scipy")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
diff --git a/tuning.py b/tuning.py
index c26832c..85b4e8f 100644
--- a/tuning.py
+++ b/tuning.py
@@ -16,7 +16,7 @@
 
 from pyutilz.pythonlib import ensure_installed
 
-ensure_installed("pandas numpy scipy")
+# ensure_installed("pandas numpy scipy")
 
 # ----------------------------------------------------------------------------------------------------------------------------
 # Normal Imports
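
Usage sketch for the Helpers.py changes above (not part of the patch): the experiment name, run names, tag values and the Imputation enum below are illustrative, and the sketch assumes Helpers.py is importable and an MLflow tracking URI is configured.

    from enum import Enum

    from Helpers import create_mlflow_run_label, get_or_create_mlflow_run

    parent_run, existed = get_or_create_mlflow_run(run_name="training", experiment_name="demo-experiment")

    # Child runs are now looked up (and created) via the mlflow.parentRunId tag,
    # and any extra tags are merged into the new run's tag set.
    child_run, existed = get_or_create_mlflow_run(
        run_name="fold-0",
        experiment_name="demo-experiment",
        parent_run_id=parent_run.info.run_id,
        tags={"stage": "cv"},
    )

    class Imputation(Enum):  # hypothetical enum, only to show the new Enum/class label handling
        MEAN = 1

    # Enum members are rendered by name, classes by __name__: "prep:imputation=MEAN,estimator=dict"
    label = create_mlflow_run_label({"imputation": Imputation.MEAN, "estimator": dict}, category="prep")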
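Usage sketch for the new metrics.py scorers together with the extended prepare_df_for_catboost (not part of the patch): CB_CALIB_ERROR follows CatBoost's user-defined metric protocol (is_max_optimal / evaluate / get_final_error) and can be passed as eval_metric. The tiny frame is a placeholder, and the sketch assumes fast_calibration_metrics already exists in metrics.py, as the new classes require.

    import pandas as pd
    from catboost import CatBoostClassifier

    from metrics import CB_CALIB_ERROR
    from preprocessing import prepare_df_for_catboost

    df = pd.DataFrame({"city": ["a", None, "b", "a"], "amount": [1.0, 2.0, 3.0, 4.0]})
    target = [0, 1, 0, 1]

    cat_features: list = []  # extended in place by the helper
    prepare_df_for_catboost(df, text_features=[], cat_features=cat_features, verbose=True)

    # A user-defined metric object is evaluated on the eval set at every iteration.
    model = CatBoostClassifier(iterations=50, eval_metric=CB_CALIB_ERROR(), verbose=False)
    model.fit(df, target, cat_features=cat_features, eval_set=(df, target))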
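One plausible consumer of the plain calib_error function (an assumption, not stated in the patch): recent xgboost versions accept a callable with a (y_true, y_pred) -> float signature as eval_metric in the scikit-learn interface, which matches calib_error. The synthetic data is illustrative.

    import numpy as np
    from xgboost import XGBClassifier

    from metrics import calib_error

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = (X[:, 0] + rng.normal(size=200) > 0).astype(int)

    # calib_error returns an error, so smaller values are better.
    model = XGBClassifier(n_estimators=50, eval_metric=calib_error)
    model.fit(X[:150], y[:150], eval_set=[(X[150:], y[150:])], verbose=False)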