Commit 6e6aab5 (no commit message)
fingoldo committed Aug 30, 2023
1 parent b3745c4
Showing 13 changed files with 98 additions and 23 deletions.
26 changes: 22 additions & 4 deletions Helpers.py
@@ -16,8 +16,10 @@
# -----------------------------------------------------------------------------------------------------------------------------------------------------

from typing import * # noqa: F401 pylint: disable=wildcard-import,unused-wildcard-import
from enum import Enum
import mlflow


########################################################################################################################################################################################################################################
# Helper functions
########################################################################################################################################################################################################################################
@@ -115,19 +117,29 @@ def embed_website_to_mlflow(url:str,fname:str="url",extension:str='.html',width:
with open(fname+extension, "w") as f:
f.write(website_embed)

def get_or_create_mlflow_run(run_name: str, parent_run_id: str = None, experiment_name: str = None, experiment_id: str = None) -> Tuple[object, bool]:
def get_or_create_mlflow_run(run_name: str, parent_run_id: str = None, experiment_name: str = None, experiment_id: str = None,tags:dict={}) -> Tuple[object, bool]:
"""Tries to find a run by name within current mlflow experiment.
If not found, creates new one.
"""
runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f'run_name = "{run_name}"', output_format="list")
filter_string=f'run_name = "{run_name}"'
if parent_run_id:
filter_string+=f' and tag.mlflow.parentRunId = "{parent_run_id}"'

runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=filter_string, output_format="list",)
if runs:
for run in runs:
return run, True
else:
if experiment_name:
mlflow.set_experiment(experiment_name=experiment_name)
run_tags={"mlflow.parentRunId": parent_run_id} if parent_run_id else None
if tags:
if run_tags is None:
run_tags=tags
else:
run_tags.update(tags)
run = mlflow.start_run(
run_name=run_name, experiment_id=experiment_id, tags={"mlflow.parentRunId": parent_run_id} if parent_run_id else None
run_name=run_name, experiment_id=experiment_id, tags=run_tags
) # parent_run.info.run_id
mlflow.end_run()
return run, False
@@ -136,7 +148,13 @@ def create_mlflow_run_label(params: dict, category: str = None) -> str:
label = []
for key, value in params.items():
if value:
label.append(f"{key}={value}")
if isinstance(value, Enum):
label.append(f"{key}={value.name}")
else:
if type(value) == type:
label.append(f"{key}={value.__name__}")
else:
label.append(f"{key}={value}")
label = ",".join(label)
if category:
label = f"{category}:{label}"
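
As a point of reference, here is a minimal usage sketch of the updated helpers; the experiment name, run names, tags and the Booster enum are invented for illustration, and the import path assumes Helpers.py is importable as a top-level module:

# Hypothetical usage of the updated Helpers.py API (illustrative names only)
from enum import Enum
import mlflow
from Helpers import get_or_create_mlflow_run, create_mlflow_run_label  # import path assumed

class Booster(Enum):
    CATBOOST = 1
    XGBOOST = 2

params = {"booster": Booster.CATBOOST, "n_estimators": 1000}
# Enum members are rendered by name, classes by __name__, everything else via str():
run_name = create_mlflow_run_label(params, category="baseline")  # "baseline:booster=CATBOOST,n_estimators=1000"

mlflow.set_experiment("demo_experiment")
parent_run, _ = get_or_create_mlflow_run(run_name="parent", experiment_name="demo_experiment")
child_run, existed = get_or_create_mlflow_run(
    run_name=run_name,
    parent_run_id=parent_run.info.run_id,
    experiment_name="demo_experiment",
    tags={"stage": "tuning"},  # merged with the mlflow.parentRunId tag inside the helper
)
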
2 changes: 1 addition & 1 deletion calibration.py
@@ -18,7 +18,7 @@

from pyutilz.pythonlib import ensure_installed

ensure_installed("pandas numpy properscoring scikit-learn")
# ensure_installed("pandas numpy properscoring") # scikit-learn

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion custom_estimators.py
@@ -14,7 +14,7 @@

from pyutilz.pythonlib import ensure_installed

ensure_installed("numpy pandas scikit-learn")
# ensure_installed("numpy pandas") # scikit-learn

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion eda.py
@@ -16,7 +16,7 @@

from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order

ensure_installed("pandas")
# ensure_installed("pandas")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion ewma.py
@@ -14,7 +14,7 @@

from pyutilz.pythonlib import ensure_installed

ensure_installed("numpy")
# ensure_installed("numpy")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion feature_cleaning.py
@@ -24,7 +24,7 @@

from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order

ensure_installed("numpy pandas psutil")
# ensure_installed("numpy pandas psutil")

# -----------------------------------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion feature_engineering/basic.py
@@ -16,7 +16,7 @@

from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order

ensure_installed("numpy pandas")
# ensure_installed("numpy pandas")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion feature_engineering/numerical.py
@@ -18,7 +18,7 @@
ensure_installed,
) # lint: disable=ungrouped-imports,disable=wrong-import-order

ensure_installed("numpy numba sklearn antropy entropy_estimators") # npeet?
# ensure_installed("numpy numba sklearn antropy entropy_estimators") # npeet?

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion feature_engineering/timeseries.py
@@ -12,7 +12,7 @@

from pyutilz.pythonlib import ensure_installed

ensure_installed("numpy pandas PyWavelets")
# ensure_installed("numpy pandas") # PyWavelets

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
59 changes: 58 additions & 1 deletion metrics.py
@@ -1,8 +1,16 @@
import numpy as np, pandas as pd
# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------

from typing import *
from numba import njit
from math import floor
import numpy as np, pandas as pd
from matplotlib import pyplot as plt

# ----------------------------------------------------------------------------------------------------------------------------
# Core
# ----------------------------------------------------------------------------------------------------------------------------

def fast_auc(y_true: np.array, y_score: np.array) -> float:
"""np.argsort needs to stay out of njitted func."""
@@ -183,3 +191,52 @@ def predictions_time_instability(preds: pd.Series) -> float:
For binary classification, instability ranges from 0 to 1; for regression, from 0 to any value depending on the target stats.
"""
return np.abs(np.diff(preds)).mean()


# ----------------------------------------------------------------------------------------------------------------------------
# Errors & scorers
# ----------------------------------------------------------------------------------------------------------------------------


class CB_CALIB_ERROR:
def is_max_optimal(self):
return False  # error metric: lower is better

def evaluate(self, approxes, target, weight):
output_weight = 1 # weight is not used

# predictions=expit(approxes[0])
predictions = 1 / (1 + np.exp(-approxes[0]))

calibration_mae, calibration_std = fast_calibration_metrics(y_true=target, y_pred=predictions)
return calibration_mae + calibration_std / 10, output_weight

def get_final_error(self, error, weight):
return error


class CB_PRECISION:
def is_max_optimal(self):
return False # greater is better

def evaluate(self, approxes, target, weight):
output_weight = 1 # weight is not used

# predictions=expit(approxes[0])
predictions = 1 / (1 + np.exp(-approxes[0]))

return fast_precision(y_true=target, y_pred=(predictions >= 0.5).astype(np.int8), zero_division=0), output_weight

def get_final_error(self, error, weight):
return error


def calib_error(labels: np.ndarray, predt: np.ndarray) -> float:
"""Calibration error."""

calibration_mae, calibration_std = fast_calibration_metrics(y_true=labels, y_pred=predt)
return calibration_mae + calibration_std / 10


def calib_error_keras(labels: np.ndarray, predt: np.ndarray) -> float:
return calib_error(labels=labels.numpy()[:, -1], predt=predt.numpy()[:, -1],)
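
For orientation, a short sketch of how a custom metric object such as CB_CALIB_ERROR can be handed to CatBoost; the dataset and hyperparameters are invented for illustration, and fast_calibration_metrics / fast_precision are assumed to be defined elsewhere in metrics.py:

# Hypothetical wiring of the custom CatBoost eval metric (illustrative data and params)
import numpy as np
from catboost import CatBoostClassifier
from metrics import CB_CALIB_ERROR  # import path assumed

rng = np.random.default_rng(0)
X = rng.normal(size=(1_000, 10))
y = (X[:, 0] + rng.normal(scale=0.5, size=1_000) > 0).astype(int)

model = CatBoostClassifier(
    iterations=200,
    eval_metric=CB_CALIB_ERROR(),  # object metric: CatBoost calls is_max_optimal/evaluate/get_final_error
    verbose=False,
)
model.fit(X[:800], y[:800], eval_set=(X[800:], y[800:]))
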
16 changes: 8 additions & 8 deletions preprocessing.py
@@ -8,31 +8,31 @@
from pyutilz.system import tqdmu


def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_features: Sequence = [], cat_features: list = [], na_filler: str = "") -> None:
def prepare_df_for_catboost(df: object, columns_to_drop: Sequence = [], text_features: Sequence = [], cat_features: list = [], na_filler: str = "",ensure_categorical:bool=True,verbose:bool=False) -> None:
"""
Catboost needs NAs replaced by a string value.
Catboost needs NAs in cat features replaced by a string value.
Possibly extends the cat_features list.
ensure_categorical=True makes further processing also suitable for xgboost.
"""
cols = set(df.columns)

for var in tqdmu(text_features, desc="Processing textual features for CatBoost...", leave=False):
if var in cols:
if var not in columns_to_drop:
if var in cols and var not in columns_to_drop:
if df[var].isna().any():
df[var] = df[var].fillna(na_filler)

for var in tqdmu(cols, desc="Processing categorical features for CatBoost...", leave=False):
if isinstance(df[var].dtype, pd.CategoricalDtype):
if df[var].isna().any():
df[var] = df[var].astype(str).fillna(na_filler).astype('category')
if var not in cat_features:
logging.info(f"{var} appended to cat_features")
#df[var] = df[var].astype(str) #(?)
if verbose: logging.info(f"{var} appended to cat_features")
cat_features.append(var)
else:
if var in cat_features:
if df[var].isna().any():
df[var] = df[var].fillna(na_filler)
df[var] = df[var].astype('category')
if ensure_categorical: df[var] = df[var].astype('category')


def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ) -> None:
@@ -47,5 +47,5 @@ def prepare_df_for_xgboost(df: object, cat_features: Sequence = [], ) -> None:
#df[var] = df[var].astype(str) #(?)
cat_features.append(var)
else:
if var in cat_features:
if var in cat_features and ensure_categorical:
df[var] = df[var].astype('category')
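
To illustrate the reworked signature, a small usage sketch follows; the frame and column names are made up, and the import path assumes preprocessing.py is importable as a top-level module:

# Hypothetical call of prepare_df_for_catboost (illustrative data)
import pandas as pd
from preprocessing import prepare_df_for_catboost  # import path assumed

df = pd.DataFrame(
    {
        "city": pd.Series(["Moscow", None, "Berlin"], dtype="category"),
        "comment": ["good", None, "bad"],
        "price": [1.0, 2.0, None],
    }
)
cat_features, text_features = [], ["comment"]

prepare_df_for_catboost(
    df,
    text_features=text_features,
    cat_features=cat_features,  # extended in place with detected categorical columns
    na_filler="",
    ensure_categorical=True,  # keep/cast categorical columns to 'category' dtype (also usable for xgboost)
    verbose=True,
)
print(cat_features)  # ['city']
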
2 changes: 1 addition & 1 deletion stats.py
@@ -16,7 +16,7 @@

from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order

ensure_installed("numpy scipy")
# ensure_installed("numpy scipy")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
2 changes: 1 addition & 1 deletion tuning.py
@@ -16,7 +16,7 @@

from pyutilz.pythonlib import ensure_installed

ensure_installed("pandas numpy scipy")
# ensure_installed("pandas numpy scipy")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
