diff --git a/Backtesting.py b/mlframe/Backtesting.py similarity index 100% rename from Backtesting.py rename to mlframe/Backtesting.py diff --git a/Data.py b/mlframe/Data.py similarity index 100% rename from Data.py rename to mlframe/Data.py diff --git a/FeatureEngineering.py b/mlframe/FeatureEngineering.py similarity index 100% rename from FeatureEngineering.py rename to mlframe/FeatureEngineering.py diff --git a/Features.py b/mlframe/Features.py similarity index 100% rename from Features.py rename to mlframe/Features.py diff --git a/LICENSE b/mlframe/LICENSE similarity index 100% rename from LICENSE rename to mlframe/LICENSE diff --git a/Models.py b/mlframe/Models.py similarity index 100% rename from Models.py rename to mlframe/Models.py diff --git a/OldEnsembling.py b/mlframe/OldEnsembling.py similarity index 100% rename from OldEnsembling.py rename to mlframe/OldEnsembling.py diff --git a/__init__.py b/mlframe/__init__.py similarity index 100% rename from __init__.py rename to mlframe/__init__.py diff --git a/arrays.py b/mlframe/arrays.py similarity index 100% rename from arrays.py rename to mlframe/arrays.py diff --git a/baselines.py b/mlframe/baselines.py similarity index 100% rename from baselines.py rename to mlframe/baselines.py diff --git a/boruta_shap.py b/mlframe/boruta_shap.py similarity index 100% rename from boruta_shap.py rename to mlframe/boruta_shap.py diff --git a/calibration.py b/mlframe/calibration.py similarity index 100% rename from calibration.py rename to mlframe/calibration.py diff --git a/cluster.py b/mlframe/cluster.py similarity index 100% rename from cluster.py rename to mlframe/cluster.py diff --git a/config.py b/mlframe/config.py similarity index 100% rename from config.py rename to mlframe/config.py diff --git a/core.py b/mlframe/core.py similarity index 100% rename from core.py rename to mlframe/core.py diff --git a/custom_estimators.py b/mlframe/custom_estimators.py similarity index 100% rename from custom_estimators.py rename to mlframe/custom_estimators.py diff --git a/eda.py b/mlframe/eda.py similarity index 100% rename from eda.py rename to mlframe/eda.py diff --git a/ensembling.py b/mlframe/ensembling.py similarity index 100% rename from ensembling.py rename to mlframe/ensembling.py diff --git a/estimators.py b/mlframe/estimators.py similarity index 100% rename from estimators.py rename to mlframe/estimators.py diff --git a/evaluation.py b/mlframe/evaluation.py similarity index 100% rename from evaluation.py rename to mlframe/evaluation.py diff --git a/ewma.py b/mlframe/ewma.py similarity index 100% rename from ewma.py rename to mlframe/ewma.py diff --git a/experiments.py b/mlframe/experiments.py similarity index 100% rename from experiments.py rename to mlframe/experiments.py diff --git a/explainability.py b/mlframe/explainability.py similarity index 100% rename from explainability.py rename to mlframe/explainability.py diff --git a/feature_cleaning.py b/mlframe/feature_cleaning.py similarity index 100% rename from feature_cleaning.py rename to mlframe/feature_cleaning.py diff --git a/mlframe/feature_engineering/__init__.py b/mlframe/feature_engineering/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mlframe/feature_engineering/basic.py b/mlframe/feature_engineering/basic.py new file mode 100644 index 0000000..c3b995e --- /dev/null +++ b/mlframe/feature_engineering/basic.py @@ -0,0 +1,57 @@ +"""Basic feature engineering for ML.""" + +# pylint: 
disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# LOGGING
+# ----------------------------------------------------------------------------------------------------------------------------
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Packages
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from pyutilz.pythonlib import ensure_installed  # lint: disable=ungrouped-imports,disable=wrong-import-order
+
+# ensure_installed("numpy pandas")
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Normal Imports
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from typing import *
+
+import warnings
+
+warnings.simplefilter(action="ignore", category=FutureWarning)
+import pandas as pd, numpy as np
+
+
+def create_date_features(
+    df: pd.DataFrame,
+    cols: list,
+    delete_original_cols: bool = True,
+    bulk: bool = False,
+    methods: dict = {"day": np.int8, "weekday": np.int8, "month": np.int8},  # "week": np.int8, #, "quarter": np.int8 # , "year": np.int16
+) -> pd.DataFrame:
+    """Add integer calendar components (day, weekday, month, ...) of the given datetime columns as new features."""
+    if len(cols) == 0:
+        return df
+
+    if bulk:
+        tmp = {}
+    else:
+        tmp = df
+    for col in cols:
+        for method, dtype in methods.items():
+            tmp[col + "_" + method] = getattr(df[col].dt, method).astype(dtype)
+
+    if delete_original_cols:
+        df.drop(columns=cols, inplace=True)
+
+    if bulk:
+        df = pd.concat([df, pd.DataFrame(tmp)], axis=1)
+
+    return df
diff --git a/mlframe/feature_engineering/categorical.py b/mlframe/feature_engineering/categorical.py
new file mode 100644
index 0000000..d9355e0
--- /dev/null
+++ b/mlframe/feature_engineering/categorical.py
@@ -0,0 +1,126 @@
+"""Categorical feature engineering for ML.
Optimized & rich set of aggregates for 1d vectors.""" + +# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement + +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# ---------------------------------------------------------------------------------------------------------------------------- + +import logging + +logger = logging.getLogger(__name__) + +# ---------------------------------------------------------------------------------------------------------------------------- +# Packages +# ---------------------------------------------------------------------------------------------------------------------------- + +from pyutilz.pythonlib import ( + ensure_installed, +) # lint: disable=ungrouped-imports,disable=wrong-import-order + +# ensure_installed("numpy pandas scipy") + +# ---------------------------------------------------------------------------------------------------------------------------- +# Normal Imports +# ---------------------------------------------------------------------------------------------------------------------------- + +from typing import * + +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +from antropy import * +import pandas as pd, numpy as np +from scipy.stats import entropy +from .numerical import compute_numaggs, get_numaggs_names + +import warnings + +warnings.filterwarnings("ignore", message="nperseg =") + +numaggs_names = get_numaggs_names() +directional_numaggs_names = get_numaggs_names(directional_only=True) + + +def compute_countaggs( + arr: pd.Series, + counts_normalize: bool = True, # use relative or absolute counts + counts_compute_numaggs: bool = True, # compute numerical aggregates over counts data or not + counts_top_n: int = 1, # return that many highest/lowest value counts + counts_return_top_counts: bool = True, # return top counts + counts_return_top_values: bool = True, # return top values + counts_compute_values_numaggs: bool = False, # if all values are in fact numerical, compute numaggs for them rather than their counts (ordered only, in order of their counts) + numerical_kwargs: dict = dict(return_unsorted_stats=False), +): + """For some variables, especially with many repeated values, or categorical, we can do value_counts(normalize=True or False). 
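    For instance (an illustrative sketch only), pd.Series(["a", "a", "a", "b"]).value_counts(normalize=True) yields the counts [0.75, 0.25]
    for the values ["a", "b"]; it is these counts (and, optionally, the top values themselves) that the aggregates below are computed over.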
Further we can return + 1) Top N highest/lowest values along with their counts (missing are padded with NaNs) + 2) numaggs over counts data + 3) if variable is numeric, numaggs(timeseries_features=True) for values series sorted by counts (timeseries_features=True leaves only aggregates depending on the order of values, + 'cause otherwise it's simply a duplication of num_aggs over regular series) + """ + value_counts = arr.value_counts(normalize=counts_normalize) + value_counts=value_counts[value_counts > 0] + values = value_counts.index.values + counts = value_counts.values + + res = [] + + if counts_compute_numaggs: + res.extend(compute_numaggs(arr=counts, **numerical_kwargs)) + + if counts_top_n: + + if len(counts) >= counts_top_n: + extra = [] + else: + extra = [np.nan] * (counts_top_n - len(counts)) + + if counts_return_top_counts: + res.extend(counts[:counts_top_n].tolist() + extra) + res.extend(extra + counts[-counts_top_n:].tolist()) + if counts_return_top_values: + res.extend(values[:counts_top_n].tolist() + extra) + res.extend(extra + values[-counts_top_n:].tolist()) + + if counts_compute_values_numaggs: + if pd.api.types.is_numeric_dtype(values): + processed_numerical_kwargs = numerical_kwargs.copy() + processed_numerical_kwargs["directional_only"] = True + res.extend(compute_numaggs(arr=values, **processed_numerical_kwargs)) + else: + res.extend([np.nan] * len(directional_numaggs_names)) + + return res + + +def get_countaggs_names( + counts_normalize: bool = True, # use relative or absolute counts + counts_compute_numaggs: bool = True, # compute numerical aggregates over counts data or not + counts_top_n: int = 1, # return that many highest/lowest value counts + counts_return_top_counts: bool = True, # return top counts + counts_return_top_values: bool = True, # return top values + counts_compute_values_numaggs: bool = True, # if all values are in fact numerical, compute numaggs for them rather than their counts (ordered only, in order of their counts) + numerical_kwargs: dict = dict(return_unsorted_stats=False), +) -> list: + + res = [] + + if counts_compute_numaggs: + res.extend([feat + "_" + ("cntnrm" if counts_normalize else "cnt") for feat in get_numaggs_names(**numerical_kwargs)]) + + if counts_top_n: + if counts_return_top_counts: + res.extend(["top_" + str(i + 1) + "_vcnt" for i in range(counts_top_n)]) + res.extend(["btm_" + str(counts_top_n - i) + "_vcnt" for i in range(counts_top_n)]) + if counts_return_top_values: + res.extend(["top_" + str(i + 1) + "_vval" for i in range(counts_top_n)]) + res.extend(["btm_" + str(counts_top_n - i) + "_vval" for i in range(counts_top_n)]) + + if counts_compute_values_numaggs: + # if pd.api.types.is_numeric_dtype(values): + processed_numerical_kwargs = numerical_kwargs.copy() + processed_numerical_kwargs["directional_only"] = True + res.extend([feat + "_vvls" for feat in directional_numaggs_names]) + + return res diff --git a/mlframe/feature_engineering/hurst.py b/mlframe/feature_engineering/hurst.py new file mode 100644 index 0000000..ad65efd --- /dev/null +++ b/mlframe/feature_engineering/hurst.py @@ -0,0 +1,124 @@ +"""Compute the Hurst Exponent of an 1D array by the means of R/S analisys: + + https://en.wikipedia.org/wiki/Hurst_exponent +""" + +# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement + +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# 
----------------------------------------------------------------------------------------------------------------------------
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Packages
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from pyutilz.pythonlib import (
+    ensure_installed,
+)  # lint: disable=ungrouped-imports,disable=wrong-import-order
+
+ensure_installed("numpy pandas numba scipy sklearn antropy")
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Normal Imports
+# ----------------------------------------------------------------------------------------------------------------------------
+
+from typing import *
+import numpy as np
+from numba import njit
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Inits
+# ----------------------------------------------------------------------------------------------------------------------------
+
+fastmath = False
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Core funcs
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+@njit(fastmath=fastmath)
+def compute_hurst_rs(arr: np.ndarray, agg_func: object = np.mean):
+    """Computes the R/S stat for a single window."""
+
+    mean = agg_func(arr)
+
+    deviations = arr - mean
+    Z = np.cumsum(deviations)
+    R = np.max(Z) - np.min(Z)
+    S = np.std(arr)  # , ddof=1
+
+    if R == 0 or S == 0:
+        return 0.0  # to skip this interval due to the undefined R/S ratio
+
+    return R / S
+
+
+@njit(fastmath=fastmath)
+def precompute_hurst_exponent(
+    arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = True, agg_func: object = np.mean
+):
+    """Computes the aggregated R/S stat for a log-spaced grid of window sizes (inputs for the final log-log fit)."""
+
+    # Get diffs, if needed
+
+    if take_diffs:
+        arr = arr[1:] - arr[:-1]
+
+    L = len(arr)
+
+    # Split parent array several times into a number of equal chunks, increasing the chunk length
+
+    max_window = max_window or (L - 1)
+    window_sizes = (10 ** np.arange(np.log10(min_window), np.log10(max_window), windows_log_step)).astype(np.int32)
+    # window_sizes.append(L)
+
+    RS = []
+    used_window_sizes = []
+    for w in window_sizes:
+        rs = []
+        for start in range(0, L, w):
+            if (start + w) >= L:
+                break
+            partial_rs = compute_hurst_rs(arr[start : start + w])  # , agg_func=agg_func)
+            if partial_rs:
+                rs.append(partial_rs)
+        if rs:
+            RS.append(agg_func(np.array(rs)))
+            used_window_sizes.append(w)
+
+    return used_window_sizes, RS
+
+
+def compute_hurst_exponent(arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = False) -> tuple:
+    """Main entrypoint to compute the Hurst Exponent (and the constant) of a numerical array."""
+    if len(arr) < min_window:
+        return np.nan, np.nan
+    window_sizes, rs = precompute_hurst_exponent(
+        arr=arr, min_window=min_window, max_window=max_window, windows_log_step=windows_log_step, take_diffs=take_diffs
+    )
+    x = np.vstack([np.log10(window_sizes), np.ones(len(rs))]).T
+    h, c = np.linalg.lstsq(x, np.log10(rs), rcond=-1)[0]
+    c = 10**c
+    return h, c
+
+
+def 
hurst_testing(): + + # pip install hurst + + from hurst import random_walk + + brownian = random_walk(1000, proba=0.5) + print(compute_hurst_exponent(np.array(brownian))) + + persistent = random_walk(1000, proba=0.7) + print(compute_hurst_exponent(np.array(persistent))) + + antipersistent = random_walk(1000, proba=0.3) + print(compute_hurst_exponent(np.array(antipersistent))) diff --git a/mlframe/feature_engineering/numerical.py b/mlframe/feature_engineering/numerical.py new file mode 100644 index 0000000..4e61f6e --- /dev/null +++ b/mlframe/feature_engineering/numerical.py @@ -0,0 +1,1032 @@ +"""Numerical feature engineering for ML. Optimized & rich set of aggregates for 1d vectors.""" + +# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement + +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# ---------------------------------------------------------------------------------------------------------------------------- + +import logging + +logger = logging.getLogger(__name__) + +# ---------------------------------------------------------------------------------------------------------------------------- +# Packages +# ---------------------------------------------------------------------------------------------------------------------------- + +from pyutilz.pythonlib import ( + ensure_installed, +) # lint: disable=ungrouped-imports,disable=wrong-import-order + +ensure_installed("numpy numba sklearn antropy astropy entropy_estimators") # npeet? + +# ---------------------------------------------------------------------------------------------------------------------------- +# Normal Imports +# ---------------------------------------------------------------------------------------------------------------------------- + +from typing import * + +import numba +import numpy as np, pandas as pd +from antropy import * +from astropy.stats import histogram +from entropy_estimators import continuous +from sklearn.feature_selection import mutual_info_regression +from mlframe.feature_engineering.hurst import compute_hurst_exponent +from pyutilz.parallel import parallel_run + +from scipy.stats import entropy, kstest +from scipy import stats + +import warnings + +warnings.filterwarnings("ignore", message="nperseg =") +warnings.simplefilter(action="ignore", category=FutureWarning) +warnings.simplefilter(action="ignore", category=RuntimeWarning) + +# ---------------------------------------------------------------------------------------------------------------------------- +# Inits +# ---------------------------------------------------------------------------------------------------------------------------- + +fastmath = False +empty_float32_array = np.array([], dtype=np.float32) + + +def cont_entropy(arr: np.ndarray, bins: str = "scott") -> float: + """Entropy of a continuous distribution""" + try: + hist, bin_edges = histogram(arr, bins=bins) # np.histogram(arr, bins=bins, density=True) + ent = -(hist * np.log(hist + 1e-60)).sum() + except Exception as e: + return np.nan + return ent + + +entropy_funcs = (cont_entropy, continuous.get_h, app_entropy, svd_entropy, sample_entropy, petrosian_fd, perm_entropy, katz_fd, detrended_fluctuation) # +entropy_funcs_names = [f.__name__ for f in entropy_funcs] + +distributions = (stats.levy_l, stats.logistic, stats.pareto) # stats.logistic, stats.pareto +default_dist_responses = dict(levy_l=[np.nan, np.nan], logistic=[np.nan, 
np.nan], pareto=[np.nan, np.nan, np.nan])
+
+LARGE_CONST = 1e3
+
+
+def get_distributions_features_names() -> list:
+    distributions_features_names = []
+    for dist in distributions:
+        for i in range(len(default_dist_responses[dist.name])):
+            distributions_features_names.append(dist.name + str(i + 1))
+        distributions_features_names.append(dist.name + "_kss")
+        distributions_features_names.append(dist.name + "_kspval")
+    return distributions_features_names
+
+
+distributions_features_names = get_distributions_features_names()
+
+default_quantiles: list = [0.1, 0.25, 0.5, 0.75, 0.9]  # list vs ndarray gives advantage 125 µs ± 2.79 µs per loop vs 140 µs ± 8.11 µs per loop
+
+
+@numba.njit(fastmath=fastmath)
+def compute_simple_stats_numba(arr: np.ndarray) -> tuple:
+    minval, maxval, argmin, argmax = arr[0], arr[0], 0, 0
+    size = 0
+    total, std_val = 0.0, 0.0
+
+    for i, next_value in enumerate(arr):
+        if np.isfinite(next_value):
+            size += 1
+            total += next_value
+            if next_value < minval:
+                minval = next_value
+                argmin = i
+            elif next_value > maxval:
+                maxval = next_value
+                argmax = i
+
+    if size == 0:
+        size = len(arr)
+
+    if size:
+        mean_value = total / size
+    else:
+        mean_value = 0.0
+        minval, maxval = 0.0, 0.0
+
+    for i, next_value in enumerate(arr):
+        if np.isfinite(next_value):
+            d = next_value - mean_value
+            summand = d * d
+            std_val = std_val + summand
+    if size:
+        std_val = np.sqrt(std_val / size)
+    else:
+        std_val = 0.0
+    return minval, maxval, argmin, argmax, mean_value, std_val
+
+
+def get_simple_stats_names() -> list:
+    return "min,max,argmin,argmax,mean,std".split(",")
+
+
+@numba.njit(fastmath=fastmath)
+def compute_numerical_aggregates_numba(
+    arr: np.ndarray,
+    weights: np.ndarray = None,
+    geomean_log_mode: bool = False,
+    directional_only: bool = False,
+    whiten_means: bool = True,
+    return_drawdown_stats: bool = False,
+    return_profit_factor: bool = False,
+    return_n_zer_pos_int: bool = True,
+    return_exotic_means: bool = True,
+    return_unsorted_stats: bool = True,
+) -> list:
+    """Compute statistical aggregates over 1d array of float32 values.
+    E mid2(abs(x-mid1(X))) where mid1, mid2=averages of any kind
+    E Error functions can sometimes make great features themselves...
+    V What happens first: min or max? Add relative percentage of min/max indices
+    V Add absolute values of min/max indices?
+    V Add the number of crossings of the mean and median values, and of the slope line? (trend reversions)
+        Although this can also be obtained by calling the stats on a normalized or detrended series (x-X_avg) or (x-(slope*x+x[0]))
+    V remove gaps. That is a second-order statistic and should be computed separately, either from differences or from ratios.
+    V compute weighted stats in a separate call (and not only the arithmetic mean, but ALL of the means).
+    Add:
+    V cubic mean,
+    V entropy
+    V hurst
+    V R2
+    E? winsorized mean (https://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BD%D0%B7%D0%BE%D1%80%D0%B8%D0%B7%D0%BE%D0%B2%D0%B0%D0%BD%D0%BD%D0%BE%D0%B5_%D1%81%D1%80%D0%B5%D0%B4%D0%BD%D0%B5%D0%B5).
+    E? trimmed (truncated) mean,
+    E? tukey mean
+    V fit variable to a number of known distributions!! 
their params become new features + V drawdowns, negative drawdowns (for shorts), dd duration (%) + V Number of MAX/MIN refreshers during period + V numpeaks + """ + + size = len(arr) + + first = arr[0] + last = arr[-1] + if first: + last_to_first = last / first + else: + last_to_first = LARGE_CONST * np.sign(last) + + if directional_only: + arithmetic_mean = np.mean(arr) + return [arithmetic_mean, last_to_first] + + ninteger, npositive, cnt_nonzero = 0, 0, 0 + sum_positive, sum_negative = 0.0, 0.0 + + if not geomean_log_mode: + geometric_mean = 1.0 + else: + geometric_mean = 0.0 + + arithmetic_mean, quadratic_mean, qubic_mean, harmonic_mean = 0.0, 0.0, 0.0, 0.0 + weighted_arithmetic_mean = 0.0 + if weights is not None: + weighted_geometric_mean, weighted_arithmetic_mean, weighted_quadratic_mean, weighted_qubic_mean, weighted_harmonic_mean = ( + geometric_mean, + arithmetic_mean, + quadratic_mean, + qubic_mean, + harmonic_mean, + ) + sum_weights = 0.0 + + maximum, minimum = first, first + max_index, min_index = 0, 0 + + if return_drawdown_stats: + + pos_dd_start_idx, neg_dd_start_idx = 0, 0 + + pos_dds = np.empty(shape=size, dtype=np.float32) + pos_dd_durs = np.empty(shape=size, dtype=np.float32) + + neg_dds = np.empty(shape=size, dtype=np.float32) + neg_dd_durs = np.empty(shape=size, dtype=np.float32) + + nmaxupdates, nminupdates = 0, 0 + + n_last_crossings, n_last_touches = 0, 0 + prev_d = None + + for i, next_value in enumerate(arr): + if weights is not None: + next_weight = weights[i] + sum_weights += weights[i] + if weights is not None: + weighted_arithmetic_mean += next_value * next_weight + + if return_unsorted_stats: + d = next_value - last + if prev_d is not None: + mul = d * prev_d + if mul < 0: + n_last_crossings += 1 + elif mul == 0.0: + n_last_touches += 1 + else: + if next_value == last: + n_last_touches += 1 + prev_d = d + + arithmetic_mean += next_value + + if return_exotic_means: + temp_value = next_value * next_value + quadratic_mean += temp_value + if weights is not None: + weighted_quadratic_mean += temp_value * next_weight + + temp_value = temp_value * next_value + qubic_mean += temp_value + if weights is not None: + weighted_qubic_mean += temp_value * next_weight + + if next_value > maximum: + maximum = next_value + max_index = i + nmaxupdates += 1 + elif next_value < minimum: + minimum = next_value + min_index = i + nminupdates += 1 + + # ---------------------------------------------------------------------------------------------------------------------------- + # Drawdowns + # ---------------------------------------------------------------------------------------------------------------------------- + + if return_drawdown_stats: + # pos + dd = maximum - next_value + pos_dds[i] = dd + if dd == 0.0: + pos_dd_start_idx = i + pos_dd_durs[i] = i - pos_dd_start_idx + + # neg + dd = next_value - minimum + neg_dds[i] = dd + if dd == 0.0: + neg_dd_start_idx = i + neg_dd_durs[i] = i - neg_dd_start_idx + + if next_value: + cnt_nonzero = cnt_nonzero + 1 + if return_exotic_means: + addend = 1 / next_value + harmonic_mean += addend + if weights is not None: + weighted_harmonic_mean += next_weight * addend + + if return_n_zer_pos_int: + frac = next_value % 1 + if not frac: + ninteger = ninteger + 1 + + if next_value > 0: + npositive = npositive + 1 + sum_positive += next_value + if return_exotic_means: + if not geomean_log_mode: + geometric_mean *= next_value + if weights is not None: + weighted_geometric_mean *= next_value**next_weight + if geometric_mean > 1e100 or 
geometric_mean < 1e-100: + # convert to log mode + geomean_log_mode = True + geometric_mean = np.log(float(geometric_mean)) + if weights is not None: + weighted_geometric_mean = np.log(float(weighted_geometric_mean)) + else: + addend = np.log(next_value) + geometric_mean += addend + if weights is not None: + weighted_geometric_mean += next_weight * addend + else: + sum_negative += next_value + + arithmetic_mean = arithmetic_mean / size + if weights is not None: + weighted_arithmetic_mean = weighted_arithmetic_mean / sum_weights + + if return_exotic_means: + quadratic_mean = np.sqrt(quadratic_mean / size) + qubic_mean = (qubic_mean / size) ** (1 / 3) + if npositive: + if not geomean_log_mode: + geometric_mean = geometric_mean ** (1 / size) + else: + geometric_mean = np.exp(geometric_mean / size) + else: + geometric_mean = np.nan + if harmonic_mean: + harmonic_mean = size / harmonic_mean + else: + harmonic_mean = np.nan + + if weights is not None: + weighted_quadratic_mean = np.sqrt(weighted_quadratic_mean / sum_weights) + weighted_qubic_mean = (weighted_qubic_mean / sum_weights) ** (1 / 3) + if npositive and sum_weights != 0.0: + if not geomean_log_mode: + weighted_geometric_mean = weighted_geometric_mean ** (1 / sum_weights) + else: + weighted_geometric_mean = np.exp(weighted_geometric_mean / sum_weights) + if weighted_geometric_mean == np.inf: + weighted_geometric_mean = 0.0 + else: + weighted_geometric_mean = np.nan + if weighted_harmonic_mean: + weighted_harmonic_mean = sum_weights / weighted_harmonic_mean + else: + weighted_harmonic_mean = np.nan + + if whiten_means: + quadratic_mean = quadratic_mean - arithmetic_mean + qubic_mean = qubic_mean - arithmetic_mean + geometric_mean = geometric_mean - arithmetic_mean + harmonic_mean = harmonic_mean - arithmetic_mean + if weights is not None: + weighted_quadratic_mean = weighted_quadratic_mean - weighted_arithmetic_mean + weighted_qubic_mean = weighted_qubic_mean - weighted_arithmetic_mean + weighted_geometric_mean = weighted_geometric_mean - weighted_arithmetic_mean + weighted_harmonic_mean = weighted_harmonic_mean - weighted_arithmetic_mean + + res = [arithmetic_mean] + if weights is not None: + res.append(weighted_arithmetic_mean) + res.extend( + ( + minimum, + maximum, + ) + ) # cant combine with the next statement as it's failing on interger inputs due to tuple dtypes mismatch + res.extend( + ( + arithmetic_mean / first if first else LARGE_CONST * np.sign(arithmetic_mean), + first / maximum if maximum else LARGE_CONST * np.sign(first), + minimum / first if first else LARGE_CONST * np.sign(minimum), + last_to_first, + ) + ) + + if return_unsorted_stats: # must be false for arrays known to be sorted + res.extend(((min_index + 1) / size if size else 0, (max_index + 1) / size if size else 0)) + res.extend((nmaxupdates, nminupdates, n_last_crossings, n_last_touches - 1)) + + if return_exotic_means: + res.extend((quadratic_mean, qubic_mean, geometric_mean, harmonic_mean)) + + if return_n_zer_pos_int: + res.extend((cnt_nonzero, npositive, ninteger)) + + if weights is not None: + if return_exotic_means: + res.extend((weighted_quadratic_mean, weighted_qubic_mean, weighted_geometric_mean, weighted_harmonic_mean)) + + if return_profit_factor: + profit_factor = sum_positive / -sum_negative if sum_negative != 0.0 else (0.0 if sum_positive == 0.0 else LARGE_CONST) + res.append(profit_factor) + + if return_drawdown_stats: + res.extend( + compute_numerical_aggregates_numba( + arr=pos_dds[1:], + weights=weights if weights is None else weights[1:], + 
geomean_log_mode=geomean_log_mode, + directional_only=directional_only, + whiten_means=whiten_means, + return_drawdown_stats=False, + return_profit_factor=False, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + ) + res.extend( + compute_numerical_aggregates_numba( + arr=pos_dd_durs[1:] / (size - 1), + weights=weights if weights is None else weights[1:], + geomean_log_mode=geomean_log_mode, + directional_only=directional_only, + whiten_means=whiten_means, + return_drawdown_stats=False, + return_profit_factor=False, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + ) + res.extend( + compute_numerical_aggregates_numba( + arr=neg_dds[1:], + weights=weights if weights is None else weights[1:], + geomean_log_mode=geomean_log_mode, + directional_only=directional_only, + whiten_means=whiten_means, + return_drawdown_stats=False, + return_profit_factor=False, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + ) + res.extend( + compute_numerical_aggregates_numba( + arr=neg_dd_durs[1:] / (size - 1), + weights=weights if weights is None else weights[1:], + geomean_log_mode=geomean_log_mode, + directional_only=directional_only, + whiten_means=whiten_means, + return_drawdown_stats=False, + return_profit_factor=False, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + ) + + return res + + +def get_basic_feature_names( + weights: np.ndarray = None, + whiten_means: bool = True, + return_drawdown_stats: bool = False, + return_profit_factor: bool = False, + return_n_zer_pos_int: bool = True, + return_exotic_means: bool = True, + return_unsorted_stats: bool = True, +): + basic_fields = ["arimean"] + if weights is not None: + basic_fields.append("warimean") + basic_fields.extend("min,max,arimean_to_first,first_to_max,min_to_first,last_to_first".split(",")) + + if return_unsorted_stats: # must be false for arrays known to be sorted + basic_fields.append("minr") + basic_fields.append("maxr") + basic_fields.append("nmaxupdates") + basic_fields.append("nminupdates") + basic_fields.append("lastcross") + basic_fields.append("lasttouch") + + if return_exotic_means: + basic_fields.extend(("quadmean,qubmean,geomean,harmmean" if not whiten_means else "quadmeanw,qubmeanw,geomeanw,harmmeanw").split(",")) + + if return_n_zer_pos_int: + basic_fields.append("nonzero") + basic_fields.append("npos") + basic_fields.append("nint") + + if weights is not None: + if return_exotic_means: + basic_fields.extend(("wquadmean,wqubmean,wgeomean,wharmmean" if not whiten_means else "wquadmeanw,wqubmeanw,wgeomeanw,wharmmeanw").split(",")) + + res = basic_fields.copy() + + if return_profit_factor: + res.append("profit_factor") + + if return_drawdown_stats: + for var in "pos_dd pos_dd_dur neg_dd neg_dd_dur".split(): + res.extend([var + "_" + field for field in basic_fields]) + + return res + + +def compute_nunique_modes_quantiles_numpy( + arr: np.ndarray, q: list = default_quantiles, quantile_method: str = "median_unbiased", max_modes: int = 10, return_unsorted_stats: bool = True +) -> list: + """For a 1d array, computes aggregates: + nunique + modes:min,max,mean + list of quantiles (0 and 1 included by default, therefore, min/max) + number of quantiles crossings + Can NOT be numba jitted 
(yet). + """ + if return_unsorted_stats: + vals, counts = np.unique(arr, return_counts=True) + + max_modes = min(max_modes, len(counts)) + + modes_indices = np.argpartition(counts, -max_modes)[-max_modes:] + modes_indices = modes_indices[np.argsort(counts[modes_indices])][::-1] + + first_mode_count = counts[modes_indices[0]] + + if first_mode_count == 1: + modes_min, modes_max, modes_mean, modes_qty = ( + np.nan, + np.nan, + np.nan, + np.nan, + ) # for higher stability. cnt=1 is not really a mode, rather a random pick. + else: + next_idx = modes_indices[0] + best_modes = [vals[next_idx]] + for i in range(1, max_modes): + next_idx = modes_indices[i] + next_mode_count = counts[next_idx] + if next_mode_count < first_mode_count: + break + else: + best_modes.append(vals[next_idx]) + best_modes = np.asarray(best_modes) + modes_min = best_modes.min() + modes_max = best_modes.max() + modes_mean = best_modes.mean() + modes_qty = len(best_modes) + + nuniques = len(vals) + + res = (nuniques, modes_min, modes_max, modes_mean, modes_qty) + else: + res = () + + quantiles = np.quantile(arr, q, method=quantile_method) + res = res + tuple(quantiles) # .tolist() + + if return_unsorted_stats: + res = res + tuple(compute_ncrossings(arr=arr, marks=quantiles)) # .tolist() + + return res + + +def compute_ncrossings(arr: np.ndarray, marks: np.ndarray, dtype=np.int32) -> np.ndarray: + n_crossings = np.zeros(len(marks), dtype=dtype) + prev_ds = np.full(len(marks), dtype=np.float32, fill_value=np.nan) + + for next_value in arr: + for i, mark in enumerate(marks): + d = next_value - mark + if prev_ds[i] is not None: + if d * prev_ds[i] < 0: + n_crossings[i] += 1 + prev_ds[i] = d + + return n_crossings + + +@numba.njit(fastmath=fastmath) +def compute_nunique_mode_quantiles_numba(arr: np.ndarray, q: list = default_quantiles) -> tuple: + """ + NOT RECOMMENDED. use compute_nunique_modes_quantiles_numpy instead, it's faster and more functional. + numUnique and mode calculation from sorted array + CAN be numba jitted. + """ + xsorted = np.sort(arr) + + next_unique_value = xsorted[0] + numUnique = 1 + mode = next_unique_value + best_count = 1 + times_occured = 0 + for next_value in xsorted: + if next_value == next_unique_value: + times_occured = times_occured + 1 + else: + numUnique = numUnique + 1 + if times_occured > best_count: + best_count = times_occured + mode = next_unique_value + next_unique_value = next_value + times_occured = 1 + if times_occured > best_count: + best_count = times_occured + mode = next_unique_value + + factor = len(arr) + quantiles = [] + for q in q: + quantiles.append(xsorted[int(np.ceil(q * factor)) - 1]) + + # if size % 2 == 0: + # sent = int(size / 2) + # median = xsorted[sent - 1] + xsorted[sent] + # else: + # median = xsorted[int(size // 2)] + + if times_occured == 1: + mode = np.nan + return numUnique, mode, quantiles + + +@numba.njit(fastmath=fastmath) +def compute_moments_slope_mi( + arr: np.ndarray, + mean_value: float, + weights: np.ndarray = None, + weighted_mean_value: float = None, + xvals: np.ndarray = None, # empty_float32_array, + directional_only: bool = False, + return_lintrend_approx_stats: bool = True, +) -> list: + """Добавить: + ? RANSAC-регрессию или что-то подобное, устойчивое к выбросам? + V Количество пересечений не просто среднего, а ещё и квантилей. Для финансовых приложений это мб полезно, тк есть гипотеза, + что если цена тестирует уровень много раз в течение дня, она его в итоге пробъёт (https://youtu.be/2DrBc35VLvE?t=129). + ? 
Можно для начала добавить отношение крайних квантилей к макс/мин. + """ + slope_over, slope_under = 0.0, 0.0 + mad, std, skew, kurt = 0.0, 0.0, 0.0, 0.0 + if weights is not None: + sum_weights = 0.0 + weighted_mad, weighted_std, weighted_skew, weighted_kurt = mad, std, skew, kurt + + size = len(arr) + + if xvals is None: # len(xvals)==0: + xvals = np.arange(size, dtype=np.float32) + # xvals_mean = 1 / 2 * (2 * 0 + (size - 1) * 1) + xvals_mean = np.mean(xvals) + + n_mean_crossings = 0.0 + prev_d = None + + r_sum = 0.0 + + for i, next_value in enumerate(arr): + + sl_x = xvals[i] - xvals_mean + + slope_over += sl_x * next_value + slope_under += sl_x**2 + + d = next_value - mean_value + r_sum += sl_x * d + + if prev_d is not None: + if d * prev_d < 0: + n_mean_crossings += 1 + prev_d = d + + mad += abs(d) + if weights is not None: + next_weight = weights[i] + w_d = next_value - weighted_mean_value + sum_weights += next_weight + weighted_mad += abs(w_d) * next_weight + + summand = d * d + std += summand + if weights is not None: + w_summand = w_d * w_d + weighted_std += w_summand * next_weight + + if not directional_only: + + summand = summand * d + skew += summand + if weights is not None: + w_summand = w_summand * w_d + weighted_skew += w_summand * next_weight + + kurt += summand * d + if weights is not None: + weighted_skew += w_summand * w_d * next_weight + + # mi = mutual_info_regression(xvals.reshape(-1, 1), arr, n_neighbors=2) # n_neighbors=2 is strictly needed for short sequences + + std = np.sqrt(std / size) + if weights is not None: + weighted_std = np.sqrt(weighted_std / sum_weights) + + if not directional_only: + mad = mad / size + + if std == 0: + skew, kurt = 0.0, 0.0 + else: + factor = size * std**3 + if factor: + skew = skew / factor + + factor = factor * std + kurt = kurt / factor - 3.0 + + if weights is not None: + weighted_mad = weighted_mad / sum_weights + if weighted_std == 0: + weighted_skew, weighted_kurt = 0.0, 0.0 + else: + factor = size * weighted_std**3 + if factor: + weighted_skew = weighted_skew / factor + + factor = factor * weighted_std + weighted_kurt = weighted_kurt / factor - 3.0 + + if np.isclose(slope_under, 0) or np.isnan(slope_under): + r = 0.0 + slope = np.nan + intercept = np.nan + n_lintrend_crossings = np.nan + else: + slope = slope_over / slope_under + + # R-value + if np.isclose(std, 0): + r = 0 + else: + r = r_sum / (np.sqrt(slope_under) * std * np.sqrt(size)) + # Test for numerical error propagation (make sure -1 < r < 1) + if r > 1.0: + r = 1.0 + elif r < -1.0: + r = -1.0 + + # slope crossings & trend approximation errors + + prev_d = None + intercept = mean_value - slope * xvals_mean + n_lintrend_crossings = 0.0 + + if return_lintrend_approx_stats: + lintrend_data_diffs = np.empty_like(arr) + else: + lintrend_data_diffs = None + + for i, next_value in enumerate(arr): + d = next_value - (slope * xvals[i] + intercept) + if prev_d is not None: + if d * prev_d < 0: + n_lintrend_crossings += 1 + prev_d = d + if return_lintrend_approx_stats: + lintrend_data_diffs[i] = d + + res = [] + if not directional_only: + res.extend((mad, std, skew, kurt)) + if weights is not None: + res.extend((weighted_mad, weighted_std, weighted_skew, weighted_kurt)) + res.extend((slope, intercept, r, n_mean_crossings, n_lintrend_crossings)) + return res, lintrend_data_diffs + + +def compute_mutual_info_regression(arr: np.ndarray, xvals: np.ndarray = np.array([], dtype=np.float32)) -> float: + if len(xvals): + mi = mutual_info_regression(xvals.reshape(-1, 1), arr, 
n_neighbors=2) + else: + mi = mutual_info_regression(np.arange(len(arr)).reshape(-1, 1), arr, n_neighbors=2) + + return mi[0] + + +def compute_entropy_features(arr: np.ndarray, sampling_frequency: int = 100, spectral_method: str = "welch") -> list: + # hjorth_mobility, hjorth_complexity = hjorth_params(arr) + # hjorth_mobility, + # hjorth_complexity, + # spectral_entropy(arr, sf=sampling_frequency, method=spectral_method),) + # num_zerocross(arr), + + nonzero = (~np.isnan(arr)).sum() + if nonzero < 10: + return (0.0,) * len(entropy_funcs) + else: + # safe_arr = arr[~np.isnan(arr)] + return tuple(np.nan_to_num([f(arr) for f in entropy_funcs], posinf=0, neginf=0)) + + +def fit_distribution(dist: object, data: np.ndarray, method: str = "mle"): + try: + params = dist.fit(data, method=method) + except Exception as e: + return default_dist_responses[dist.name] + [np.nan, np.nan] + else: + dist_fitted = dist(*params) + ks_stat, ks_pval = kstest(data, dist_fitted.cdf) + + return tuple(np.nan_to_num([*params, ks_stat, ks_pval], posinf=0, neginf=0)) + + +def compute_distributional_features(arr: np.ndarray) -> tuple: + res = tuple() + for dist in distributions: + res = res + fit_distribution(dist=dist, data=arr) + return res + + +def compute_numaggs( + arr: np.ndarray, + xvals: np.ndarray = None, + weights: np.ndarray = None, + geomean_log_mode: bool = False, + q: list = default_quantiles, + quantile_method: str = "median_unbiased", + max_modes: int = 10, + sampling_frequency: int = 100, + spectral_method: str = "welch", + hurst_kwargs: dict = dict(min_window=10, max_window=None, windows_log_step=0.25, take_diffs=False), + directional_only: bool = False, + whiten_means: bool = True, + return_distributional: bool = False, + return_entropy: bool = True, + return_hurst: bool = True, + return_float32: bool = True, + return_profit_factor: bool = False, + return_drawdown_stats: bool = False, + return_n_zer_pos_int: bool = True, + return_exotic_means: bool = True, + return_unsorted_stats: bool = True, + return_lintrend_approx_stats: bool = False, +): + """Compute a plethora of numerical aggregates for all values in an array. + Converts an arbitrarily length array into fixed number of aggregates. 
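    Illustrative usage sketch (assumes the default keyword values; the returned vector and get_numaggs_names() are intended to stay aligned):

        vals = np.asarray([1.0, 2.0, 1.5, 3.0, 2.5, 4.0], dtype=np.float32)
        feats = compute_numaggs(vals)        # float32 vector of aggregates
        names = get_numaggs_names()          # matching feature names
        assert len(feats) == len(names)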
+ """ + if len(arr) == 0: + return (np.nan,) * len( + get_numaggs_names( + q=q, + directional_only=directional_only, + whiten_means=whiten_means, + return_distributional=return_distributional, + return_entropy=return_entropy, + return_hurst=return_hurst, + return_profit_factor=return_profit_factor, + return_drawdown_stats=return_drawdown_stats, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + ) + + res = compute_numerical_aggregates_numba( + arr, + weights=weights, + geomean_log_mode=geomean_log_mode, + directional_only=directional_only, + whiten_means=whiten_means, + return_profit_factor=return_profit_factor, + return_drawdown_stats=return_drawdown_stats, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + + arithmetic_mean = res[0] + if weights is not None: + weighted_arithmetic_mean = res[1] + else: + weighted_arithmetic_mean = 0.0 + + res = tuple(res) + if not directional_only: + res = res + compute_nunique_modes_quantiles_numpy( + arr=arr, q=q, quantile_method=quantile_method, max_modes=max_modes, return_unsorted_stats=return_unsorted_stats + ) + + moments_slope_mi, lintrend_data_diffs = compute_moments_slope_mi( + arr=arr, + weights=weights, + mean_value=arithmetic_mean, + weighted_mean_value=weighted_arithmetic_mean, + xvals=xvals, + directional_only=directional_only, + return_lintrend_approx_stats=return_lintrend_approx_stats, + ) + res = res + tuple(moments_slope_mi) + + if return_hurst: + res = res + compute_hurst_exponent(arr=arr, **hurst_kwargs) + + if not (directional_only or not return_entropy): + res = res + compute_entropy_features(arr=arr, sampling_frequency=sampling_frequency, spectral_method=spectral_method) + if return_distributional: + res = res + tuple(compute_distributional_features(arr=arr)) + if return_lintrend_approx_stats: + params = dict( + weights=weights, + geomean_log_mode=geomean_log_mode, + q=q, + quantile_method=quantile_method, + max_modes=max_modes, + sampling_frequency=sampling_frequency, + spectral_method=spectral_method, + hurst_kwargs=hurst_kwargs, + directional_only=directional_only, + whiten_means=whiten_means, + return_distributional=return_distributional, + return_entropy=return_entropy, + return_hurst=return_hurst, + return_float32=False, + return_profit_factor=False, + return_drawdown_stats=False, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + return_lintrend_approx_stats=False, + ) + res = res + tuple(compute_numaggs(arr=lintrend_data_diffs, xvals=xvals, **params)) + + if return_float32: + return np.array(res, dtype=np.float32) + else: + return res + + +def get_numaggs_names( + weights: np.ndarray = None, + q: list = default_quantiles, + directional_only: bool = False, + whiten_means: bool = True, + return_distributional: bool = False, + return_entropy: bool = True, + return_hurst: bool = True, + return_profit_factor: bool = False, + return_drawdown_stats: bool = False, + return_n_zer_pos_int: bool = True, + return_exotic_means: bool = True, + return_unsorted_stats: bool = True, + return_lintrend_approx_stats: bool = False, + **kwargs +) -> tuple: + res = tuple( + ( + ["arimean", "ratio"] + if directional_only + else get_basic_feature_names( + weights=weights, + whiten_means=whiten_means, + return_profit_factor=return_profit_factor, + 
return_drawdown_stats=return_drawdown_stats, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + ) + ) + + ([] if (directional_only or not return_unsorted_stats) else "nuniques,modmin,modmax,modmean,modqty".split(",")) + + ([] if directional_only else (["q" + str(q) for q in q])) + + ([] if not return_unsorted_stats else ["ncrs" + str(q) for q in q]) + + get_moments_slope_mi_feature_names(weights=weights, directional_only=directional_only, return_lintrend_approx_stats=return_lintrend_approx_stats) + # + ["mutual_info_regression",] + + (["hursth", "hurstc"] if return_hurst else []) + + ([] if (directional_only or not return_entropy) else entropy_funcs_names) + + (distributions_features_names if return_distributional else []) + ) + + if return_lintrend_approx_stats: + params = dict( + weights=weights, + q=q, + directional_only=directional_only, + whiten_means=whiten_means, + return_distributional=return_distributional, + return_entropy=return_entropy, + return_hurst=return_hurst, + return_float32=False, + return_profit_factor=False, + return_drawdown_stats=False, + return_n_zer_pos_int=return_n_zer_pos_int, + return_exotic_means=return_exotic_means, + return_unsorted_stats=return_unsorted_stats, + return_lintrend_approx_stats=False, + ) + res += tuple(["ltrappr_" + el for el in get_numaggs_names(**params)]) + return res + + +def get_moments_slope_mi_feature_names(weights: np.ndarray = None, directional_only: bool = False, return_lintrend_approx_stats: bool = True): + res = [] + if not directional_only: + res.extend("mad,std,skew,kurt".split(",")) + if weights is not None: + res.extend("wmad,wstd,wskew,wkurt".split(",")) + res.extend("slope,intercept,r,meancross,trendcross".split(",")) + return res + + +def numaggs_over_matrix(vals: np.ndarray, numagg_params: dict, dtype=np.float32) -> np.ndarray: + """Computes numaggs on a 2d matrix""" + feature_names = get_numaggs_names(**numagg_params) + + res = np.empty(shape=(len(vals), len(feature_names)), dtype=dtype) + + for i in range(vals.shape[0]): + res[i, :] = compute_numaggs(vals[i, :], **numagg_params) + return res + + +def numaggs_over_df_columns_parallel( + df: pd.DataFrame, cols: Sequence, numagg_params: dict, dtype=np.float32, n_jobs=-1, prefetch_factor: int = 2, **parallel_kwargs +) -> np.ndarray: + """Computes numaggs over columns of a dataframe, in parallel fashion. 
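    A minimal call sketch (illustrative; the column names are hypothetical, numagg_params takes the same keyword arguments as compute_numaggs, and `delayed` is assumed to be joblib's):

        params = dict(return_hurst=False, return_entropy=False)
        feats = numaggs_over_df_columns_parallel(df=df, cols=["open", "high", "low", "close"], numagg_params=params, n_jobs=4)
        # feats.shape == (len(df), len(get_numaggs_names(**params)))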
+ Example of parallel_kwargs: numaggs_over_df_columns_parallel(df=X, cols=cols, temp_folder=r'R:\Temp') + """ + + res = parallel_run( + [delayed(numaggs_over_matrix)(vals=chunk, numagg_params=numagg_params) for chunk in np.array_split(df.loc[:, cols].values, n_jobs * prefetch_factor)], + max_nbytes=0, + n_jobs=n_jobs, + **parallel_kwargs + ) + res = np.vstack(res) + + return res diff --git a/mlframe/feature_engineering/timeseries.py b/mlframe/feature_engineering/timeseries.py new file mode 100644 index 0000000..4498aa5 --- /dev/null +++ b/mlframe/feature_engineering/timeseries.py @@ -0,0 +1,821 @@ +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# ---------------------------------------------------------------------------------------------------------------------------- + +import logging + +logger = logging.getLogger(__name__) + +# ---------------------------------------------------------------------------------------------------------------------------- +# Packages +# ---------------------------------------------------------------------------------------------------------------------------- + +from pyutilz.pythonlib import ensure_installed + +# ensure_installed("numpy pandas") # PyWavelets + +# ---------------------------------------------------------------------------------------------------------------------------- +# Normal Imports +# ---------------------------------------------------------------------------------------------------------------------------- + +from typing import * + +import psutil +from numba import njit +from mlframe.ewma import ewma + +from mlframe.feature_engineering.numerical import compute_numaggs, get_numaggs_names +from mlframe.feature_engineering.categorical import compute_countaggs, get_countaggs_names + +from pyutilz.numpylib import smart_ratios + +import pandas as pd, numpy as np + +from pyutilz.system import tqdmu +from pyutilz.strings import slugify +from pyutilz.pythonlib import get_human_readable_set_size +from pyutilz.parallel import applyfunc_parallel +from functools import partial + +from scipy.signal import welch +from scipy.fftpack import fft +from scipy import signal +import pywt + +cCOMPACT_WAVELETS = True + +# ----------------------------------------------------------------------------------------------------------------------------------------------------- +# Inits +# ----------------------------------------------------------------------------------------------------------------------------------------------------- + + +def get_numaggs_metadata(numaggs_kwds: dict = {},numaggs_names:list=[]): + + if not numaggs_names: + numaggs_names = list(get_numaggs_names(**numaggs_kwds)) + + try: + q1_idx = numaggs_names.index("q0.25") + q3_idx = numaggs_names.index("q0.75") + except ValueError as e: + q1_idx, q3_idx = None, None + + return numaggs_names, q1_idx, q3_idx + + +default_countaggs_names = list(get_countaggs_names()) +default_numaggs_names, default_q1_idx, default_q3_idx = get_numaggs_metadata() + +# ----------------------------------------------------------------------------------------------------------------------------------------------------- +# Helpers +# ----------------------------------------------------------------------------------------------------------------------------------------------------- + + +@njit() +def find_next_cumsum_left_index(window_var_values: np.ndarray, amount: float, right_index: int = None, min_samples: int = 
1,use_abs:bool=False) -> tuple: + """Calculating windows having required turnovers.""" + total = 0.0 + if right_index <= 0: + return 0, total + + if right_index is None: + right_index = len(window_var_values) + for i in range(1, right_index): + if not np.isnan(window_var_values[right_index - i]): + total += window_var_values[right_index - i] + if use_abs: + if np.abs(total) >= amount and i >= min_samples: + return right_index - i, total + else: + if total >= amount and i >= min_samples: + return right_index - i, total + + return 0, total + +@njit() +def find_next_cumsum_right_index(window_var_values: np.ndarray, amount: float, left_index: int = None, min_samples: int = 1,use_abs:bool=False) -> tuple: + """Calculating windows having required turnovers.""" + total = 0.0 + l = len(window_var_values) + if left_index >= l - 1: + return l - 1, total + + if left_index is None: + left_index = 0 + for i in range(1, (l - left_index)): + if not np.isnan(window_var_values[left_index + i]): + total += window_var_values[left_index + i] + if use_abs: + if np.abs(total) >= amount and i >= min_samples: + return left_index + i, total + else: + if total >= amount and i >= min_samples: + return left_index + i, total + + return l - 1, total + + +def get_nwindows_expected(windows: dict) -> int: + """How many windows are expected from the entire set?""" + r = 0 + for window_var, windows_lengths in windows.items(): + r += len(windows_lengths) + return r + + +def get_ts_window_name(window_var: str, window_size: float, window_index_name: str = "") -> str: + """Give a timeseries window a human readable short name.""" + + if window_var == "": # just index + dataset_name = str(window_size) + window_index_name + else: + dataset_name = window_var + ":" + get_human_readable_set_size(window_size) + + return dataset_name + + +# ----------------------------------------------------------------------------------------------------------------------------------------------------- +# Windows and ML features over them +# ----------------------------------------------------------------------------------------------------------------------------------------------------- + + +def create_aggregated_features( + window_df: pd.DataFrame, + row_features: list, + create_features_names: bool, + features_names: list, + dataset_name: str, + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # common settings + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + vars_mask_regexp: object = None, + vars_mask_exclude_regexp: object = None, + captions_vars_sep: str = "-", + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # numericals + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + differences_features: bool = False, + ratios_features: bool = False, + robust_features: bool = False, + weighting_vars: Sequence = (), + na_fills: dict = {"": 1e3}, + span_corrections: dict = {"": 1e2}, + ewma_alphas: Sequence = (), + rolling: Sequence = (), # method_params can also include engine="numba", engine_kwargs={"parallel": True} + nonlinear_transforms=[np.cbrt], + nonnormal_vars: Sequence = (), + waveletnames="", #"rbio3.1", + numaggs_kwds: dict = 
{}, + splitting_vars:dict={}, + drawdown_vars:dict={}, + groupby_vars:dict={}, #{'ticker':['Volume']}=deals.groupby("ticker").Volume.agg("sum").values / deals.Volume.sum() + return_n_finite:bool=False, + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # categoricals + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + process_categoricals: bool = False, # categoricals will be processed as counts data + counts_processing_mask_regexp: object = None, # separate variables can be processed as counts as well + countaggs_kwds: dict = {}, + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # subsets + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + subsets: dict = {}, + checked_subsets: list = [], + subset_token: str = "_", + nested_subsets: bool = False, +): + """ + Each suitable variable of a dataframe gets numerical aggregates computed over a number of transformations over the set of its raw values + 1) as is: raw_vals + 2) ratios: div0(raw_vals[1:], raw_vals[:-1], fill=0.0) **ordered feature + 3) wavelets of raw_vals + 4) raw_vals weighted by second var, if the main var is not related to second var (has no second var in its name) + 5) exponentially weighted row_wals, for example with alphas=[0.6]: ewma(prices.Price.values, 0.6) **ordered feature + 5.1) rolling with optional scipy windows: rolling=[(dict(window=2,win_type='parzen'),'mean',dict(sym=False))]; can also include engine="numba", engine_kwargs={"parallel": True} + 6) log, or cubic root, or some other non-linear transform (yeo-johnson) of raw_vals (benefitial for non-normally distributed vars) + 7) robust subset of raw_vals, ie, within 0.1 and 0.9 quantiles + 8) for some variables, especially with many repeated values, or categorical, we can do value_counts(normalize=True or False). Further we can return + 1) Top N highest/lowest values along with their counts (missing are padded with NaNs) + 2) numaggs over counts data + 3) if variable is numeric, numaggs(timeseries_features=True) for values series sorted by counts (timeseries_features=True leaves only aggregates depending on the order of values) + 9*) possibly, ratios of numfeatures over raw values of a smaller window compared to bigger windows + 10*) lags: closest same day of month, week, year. at least for Price! pd.offsets.DateOffset(months=1) + """ + assert len(window_df)>1 + + if numaggs_kwds: + numaggs_names, q1_idx, q3_idx = get_numaggs_metadata(numaggs_kwds) + else: + numaggs_names, q1_idx, q3_idx = default_numaggs_names, default_q1_idx, default_q3_idx + + if not countaggs_kwds: + countaggs_names = default_countaggs_names + else: + countaggs_names = list(get_countaggs_names(**countaggs_kwds)) + + for var in window_df.columns: + if var in subsets or var in checked_subsets: continue + if (vars_mask_regexp is None or vars_mask_regexp.search(var)) and (vars_mask_exclude_regexp is None or not vars_mask_exclude_regexp.search(var)): + + if var in groupby_vars: + groupby=window_df.groupby(var,observed=True) #TODO! investigate: observed=True leads to missing blocks?? 
+ for sum_var in groupby_vars[var]: + if sum_var in window_df: + total=window_df[sum_var].sum() + raw_vals=groupby[sum_var].agg("sum").values + if total: raw_vals=raw_vals/ total + row_features.extend(compute_numaggs(raw_vals, **numaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, sum_var, "grpby", var, feat)) for feat in numaggs_names)) + + # is this categorical? + if window_df[var].dtype.name in ( + "category", + ): # we do not list "object", "str" to exclude pure textual columns. They can be added explicilty though. + if process_categoricals or (counts_processing_mask_regexp and counts_processing_mask_regexp.search(var)): + row_features.extend(compute_countaggs(window_df[var], **countaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, "vlscnt", feat)) for feat in countaggs_names)) + else: + if not (window_df[var].dtype.name in ("object", "str")): + if "datetime" in window_df[var].dtype.name: + raw_vals = window_df[var].diff(1).dt.total_seconds().values + else: + raw_vals = window_df[var].values + + # safe values without nans and infs + + idx = np.isfinite(raw_vals) + raw_vals = raw_vals[idx] + + if return_n_finite: + row_features.append(idx.sum()) + if create_features_names: + features_names.append(captions_vars_sep.join((dataset_name, var, "n_finite"))) + + # 1) as is: numaggs of raw_vals + + if var in drawdown_vars: + custom_numaggs_kwds=numaggs_kwds.copy() + custom_numaggs_kwds['return_drawdown_stats']=True + simple_numaggs_names=get_numaggs_names(**custom_numaggs_kwds) + else: + custom_numaggs_kwds=numaggs_kwds + simple_numaggs_names=numaggs_names + + simple_numerical_features = compute_numaggs(raw_vals, **custom_numaggs_kwds) + row_features.extend(simple_numerical_features) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, feat)) for feat in simple_numaggs_names)) + + if splitting_vars and (custom_numaggs_kwds.get('return_unsorted_stats')!=False): + if splitting_vars and var in splitting_vars: + compute_splitting_stats(window_df=window_df,dataset_name=dataset_name,splitting_vars=splitting_vars,var=var,numaggs_names=simple_numaggs_names,numaggs_values=simple_numerical_features, + row_features=row_features,features_names=features_names,create_features_names=create_features_names,captions_vars_sep=captions_vars_sep) + + if differences_features: + differences = np.diff(raw_vals, 1) + + custom_numaggs_kwds=numaggs_kwds.copy() + custom_numaggs_kwds['return_profit_factor']=True + row_features.extend(compute_numaggs(differences, **custom_numaggs_kwds)) + if create_features_names: + custom_numaggs_names=list(get_numaggs_names(**custom_numaggs_kwds)) + features_names.extend((captions_vars_sep.join((dataset_name, var, "dif", feat)) for feat in custom_numaggs_names)) + + if ratios_features: + # 2) ratios: div0(raw_vals[1:], raw_vals[:-1], fill=0.0) + #print(var) + ratios = smart_ratios( + raw_vals[1:], + raw_vals[:-1], + span_correction=span_corrections.get(var, span_corrections.get("", 1e2)), + na_fill=na_fills.get(var, na_fills.get("")), + ) + + custom_numaggs_kwds=numaggs_kwds.copy() + custom_numaggs_kwds['return_profit_factor']=True + row_features.extend(compute_numaggs(ratios, **custom_numaggs_kwds)) + if create_features_names: + custom_numaggs_names=list(get_numaggs_names(**custom_numaggs_kwds)) + features_names.extend((captions_vars_sep.join((dataset_name, var, "rat", feat)) for feat in custom_numaggs_names)) + + # 3) wavelets of 
raw_vals + if waveletnames: + custom_numaggs_kwds=numaggs_kwds.copy() + custom_numaggs_kwds['return_hurst']=False + custom_numaggs_kwds['return_entropy']=False #they all are constant anyways () + for waveletname in waveletnames: + all_coeffs=[] + for i, coeffs in enumerate(pywt.wavedec(raw_vals, waveletname)): + all_coeffs.append(coeffs) + all_coeffs=np.hstack(all_coeffs) + row_features.extend(compute_numaggs(all_coeffs, **custom_numaggs_kwds)) + if create_features_names: + custom_numaggs_names=list(get_numaggs_names(**custom_numaggs_kwds)) + features_names.extend((captions_vars_sep.join((dataset_name, var, waveletname, feat)) for feat in custom_numaggs_names)) + + # 4) raw_vals weighted by second var, if the main var is not related to second var (has no second var in its name) + for weighting_var in weighting_vars: + if weighting_var not in var and weighting_var in window_df: + weighting_values = window_df.loc[idx, weighting_var].values + row_features.extend(compute_numaggs((raw_vals / weighting_values.sum()) * weighting_values, **numaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, "wgt", weighting_var, feat)) for feat in numaggs_names)) + + # 5) exponentially weighted raw_vals with some alphas, like [0.6, 0.9]: + for alpha in ewma_alphas: + row_features.extend(compute_numaggs(ewma(raw_vals, alpha), **numaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, "ewma", str(alpha), feat)) for feat in numaggs_names)) + + # 5.1) rolling + for window, method, method_params in rolling: + if "datetime" in window_df[var].dtype.name: + #logger.warning(f"Cant't use rolling features for var {var} 'cause of datetime-like dtype.") + break + vals = getattr(window_df[var].rolling(**window), method)(**method_params).values + safe_idx = np.isfinite(vals) + vals = vals[safe_idx] + row_features.extend(compute_numaggs(vals, **numaggs_kwds)) + if create_features_names: + specs = ";".join((f"{key}={value}" for key,value in dict(**window, m=method, **method_params).items())) # comma , refused by lgbm: lightgbm.basic.LightGBMError: Do not support special JSON characters in feature name. + specs=specs.replace("win_type","t").replace("window","w") + features_names.extend((captions_vars_sep.join((dataset_name, var, "rol", specs, feat)) for feat in numaggs_names)) + + # 6) log, or cubic root, or some other non-linear transform (yeo-johnson) of raw_vals + if var in nonnormal_vars: + for nonlinear_func in nonlinear_transforms: + transform_name = nonlinear_func.__name__ + row_features.extend(compute_numaggs(nonlinear_func(raw_vals), **numaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, transform_name, feat)) for feat in numaggs_names)) + + # 7) robust subset of raw_vals, ie, within 0.1 and 0.9 quantiles. Or, better, using Tukey fences to identify outliers. 
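+                    # Sketch of the Tukey-fence filter applied below (Q1/Q3 are taken from the already computed numaggs):
+                    #   iqr = q3 - q1
+                    #   robust_subset = raw_vals[(raw_vals >= q1 - 1.5 * iqr) & (raw_vals <= q3 + 1.5 * iqr)]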
+ if robust_features: + _, q1_idx, q3_idx = get_numaggs_metadata(numaggs_names=simple_numaggs_names) + if q1_idx and q3_idx: + Q3 = simple_numerical_features[q3_idx] + Q1 = simple_numerical_features[q1_idx] + IQR = Q3 - Q1 + Lower_Bound = Q1 - 1.5 * IQR + Upper_Bound = Q3 + 1.5 * IQR + robust_subset = raw_vals[(raw_vals >= Lower_Bound) & (raw_vals <= Upper_Bound)] + #logger.info(f"Var={var}, Q1={Q1}, Q3={Q3}, len(robust_subset)={len(robust_subset)}") + + if len(robust_subset) == 0: + row_features.extend([np.nan] * len(numaggs_names)) + else: + row_features.extend(compute_numaggs(robust_subset, **numaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, "rbst", feat)) for feat in numaggs_names)) + # 8) for some variables, especially with many repeated values, or categorical, we can do value_counts(normalize=True or False). + if counts_processing_mask_regexp and counts_processing_mask_regexp.search(var): + row_features.extend(compute_countaggs(window_df[var], **countaggs_kwds)) + if create_features_names: + features_names.extend((captions_vars_sep.join((dataset_name, var, "vlscnt", feat)) for feat in countaggs_names)) + + if subsets: + for subset_var, subset_var_values in subsets.items(): + if subset_var in checked_subsets or subset_var not in window_df: + continue + + for subset_var_value in subset_var_values: + idx=window_df[subset_var] == subset_var_value + subset_df = window_df[idx] + subset_direct=True + if len(subset_df)<=1: + #logger.warning(f"Empty subset {subset_var}={subset_var_value} (size={len(subset_df)}). let's use reverse to keep ndims. Block length before was {len(row_features)}") + subset_df = window_df[~idx] + subset_direct=False + + row_features.append(subset_direct) + subset_dataset_name=None if dataset_name is None else dataset_name + subset_token + subset_var+ "=" + str(subset_var_value) + if create_features_names: + features_names.append(subset_dataset_name+captions_vars_sep+"subset_direct") + + create_aggregated_features( + window_df=subset_df, + row_features=row_features, + create_features_names=create_features_names, + features_names=features_names, + dataset_name=subset_dataset_name, + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # common settings + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + vars_mask_regexp=vars_mask_regexp, + vars_mask_exclude_regexp=vars_mask_exclude_regexp, + captions_vars_sep=captions_vars_sep, + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # numericals + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + differences_features=differences_features, + ratios_features=ratios_features, + robust_features=robust_features, + weighting_vars=weighting_vars, + na_fills=na_fills, + span_corrections=span_corrections, + ewma_alphas=ewma_alphas, + rolling=rolling, + nonlinear_transforms=nonlinear_transforms, + nonnormal_vars=nonnormal_vars, + waveletnames=waveletnames, + numaggs_kwds=numaggs_kwds, + splitting_vars=splitting_vars, + drawdown_vars=drawdown_vars, + groupby_vars=groupby_vars, + return_n_finite=return_n_finite, + # 
----------------------------------------------------------------------------------------------------------------------------------------------------- + # categoricals + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + process_categoricals=process_categoricals, + counts_processing_mask_regexp=counts_processing_mask_regexp, + countaggs_kwds=countaggs_kwds, + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # subsets + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + subsets={} if not nested_subsets else subsets, + checked_subsets=checked_subsets + [subset_var], + subset_token=subset_token, + nested_subsets=nested_subsets, + + ) + + # if not subset_direct: + # logger.warning(f"Block length after subset processing became {len(row_features)}") + +def compute_splitting_stats(window_df:pd.DataFrame,dataset_name:str,splitting_vars:dict,var:str,numaggs_names:list,numaggs_values:list, + row_features:list,features_names:list,create_features_names:bool,captions_vars_sep:str="-")->None: + splitting_vals=[] + if create_features_names: + splitting_ratios_names=[] + subvars=splitting_vars[var] + for col in ('minr','maxr'): + try: + col_idx=numaggs_names.index(col) + except Exception as e: + logger.warning(f"Could not find col={col} in numagg fields {numaggs_names}") + else: + index=int(numaggs_values[col_idx]*len(window_df))-1 + for subvar in subvars: + if subvar in window_df: + if "datetime" in window_df[subvar].dtype.name: + pre_sum=(window_df[subvar].iloc[index]-window_df[subvar].iloc[0]).total_seconds() + post_sum=(window_df[subvar].iloc[-1]-window_df[subvar].iloc[index]).total_seconds() + else: + pre_sum=window_df[subvar].iloc[:index].sum() + post_sum=window_df[subvar].iloc[index:].sum() + tot=(pre_sum+post_sum) + splitting_vals.append(pre_sum/tot if tot else 0) + if create_features_names: + splitting_ratios_names.append(captions_vars_sep.join([dataset_name, var, col,subvar,'split'])) + + row_features.extend(splitting_vals) + if create_features_names: + features_names.extend(splitting_ratios_names) + +def create_windowed_features( + start_index: int = 0, + end_index: int = 0, + df: pd.DataFrame = None, + past_processing_fcn: object = None, + future_processing_fcn: object = None, + features_creation_fcn: object = None, + targets_creation_fcn: object = None, + step_size: int = 1, + nrecords_per_period: int = 1, # use when fixed number of records belongs to the same period, for example, features contains values for 24 consecutive hours of a day. + past_windows: dict = {}, # example: {"": [7, 31, 31 * 12], "Load_West": [200e3, 1e6, 5e6]}, + future_windows: dict = {}, # example: {"": [7, 31, 31 * 12], "Load_West": [200e3, 1e6, 5e6]}, + window_index_name: str = "", # example: D for days, or T for Ticks + overlapping: bool = False, + dtype=np.float32, + verbose: bool = False, +): + """Creates, for a given range of indexes/time moments, start_index to end_index over step_size, a number of past and future windows. + Past window(s) will be used to compute ML features, future window(s) - to compute ML target(s). + Windows can be overlapping or not. They can be of a fixed size in records or in units of any numerical variable from the dataframe. 
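+    For example, past_windows={"": [7, 31]} builds two past windows of 7 and 31 records (in index units, labelled via window_index_name),
+    while past_windows={"Load_West": [1e6]} grows a window backwards until the cumulative Load_West reaches 1e6
+    (via find_next_cumsum_left_index); future windows are built the same way in the forward direction.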
+    Resulting features & targets are merged into a pandas dataframe.
+
+    How do we support a scenario where the desired features are the RATIOS of aggregated features over 2 consecutive windows?
+    What if the desired target is the ratio of some parameter in the future window to its average over 3 consecutive past windows?
+    For that, we need the ability to combine the computed window features in an arbitrary way.
+
+    Another advanced scenario:
+    we need to take 1 big past and 1 big future window for the entire market and compute features for each of them (past and future features differ).
+    Then, from those 2 big windows, we need to partition by instrument code and create the same features for each instrument.
+    Finally, as ML features we need the ratio of the instrument's past features to the market's past features.
+    As the ML targets we need one future feature of the instrument divided by the same past feature of the same instrument.
+
+    There are 2 modes. In the legacy one, features/captions over different windows are simply merged together in the downstream aggregates calculation function.
+    In the advanced mode, ML features and targets have to be constructed manually from the dicts of individually computed window variables.
+    """
+    logger.info("got ranges from %s to %s", start_index, end_index)
+
+    targets = []
+    features = []
+
+    targets_names = []
+    features_names = []
+
+    past_vars_names = []
+    future_vars_names = []
+
+    if not end_index:
+        end_index = len(df)
+
+    past_nwindows_expected = get_nwindows_expected(past_windows)
+    future_nwindows_expected = get_nwindows_expected(future_windows)
+
+    for index in tqdmu(range(start_index, end_index, step_size), desc="dataset range", leave=False):
+
+        # one step means only one row of features will be added to the features and targets lists.
+ + row_targets = [] + row_features = [] + base_point = index * nrecords_per_period + + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # Compute future features (targets constituents) first: they are usually less expensive + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + + future_windows_features = create_and_process_windows( + df=df, + base_point=base_point, + windows=future_windows, + apply_fcn=future_processing_fcn, + window_index_name=window_index_name, + overlapping=overlapping, + forward_direction=True, + window_features_names=future_vars_names, + window_features=None if targets_creation_fcn else row_targets, + create_features_names=targets, + verbose=verbose, + ) + + if (len(row_targets) < future_nwindows_expected) and len(future_windows_features) == 0: + # targets could not be fully created, skipping + continue + + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + # Compute past features + # ----------------------------------------------------------------------------------------------------------------------------------------------------- + + past_windows_features = create_and_process_windows( + df=df, + base_point=base_point, + windows=past_windows, + apply_fcn=past_processing_fcn, + window_index_name=window_index_name, + overlapping=overlapping, + forward_direction=False, + window_features_names=past_vars_names, + window_features=None if features_creation_fcn else row_features, + create_features_names=targets, + verbose=verbose, + ) + + if row_features or past_windows_features: + + features.append(row_features) + + if targets_creation_fcn: + targets.append( + targets_creation_fcn(past_windows=past_windows_features if features_creation_fcn else row_features, future_windows=future_windows_features) + ) + else: + targets.append(row_targets) + else: + if not targets: + features_names = [] + targets_names = [] + + if features: + + if targets_creation_fcn: + targets_names = [targets_creation_fcn.__name__] + + X = pd.DataFrame(data=features, columns=features_names if features_creation_fcn else past_vars_names, dtype=dtype) + Y = pd.DataFrame(data=targets, columns=targets_names if targets_creation_fcn else future_vars_names, dtype=dtype) + + if Y.shape[1] == 1: + Y = Y.iloc[:, 0] + + logger.info("computed the features") + + return X, Y + else: + return None, None + + +def create_and_process_windows( + df: pd.DataFrame, + base_point: int, + apply_fcn: object, + windows: dict, + window_features_names: list, + window_features: list, + targets: list, + forward_direction: bool = True, + window_index_name: str = "", # example: D for days, or T for Ticks, R for rows + nrecords_per_period: int = 1, # use when fixed number of records belongs to the same period, for example, data contains values for 24 consecutive hours of a day. + overlapping: bool = False, + verbose: bool = False, +): + """Build all required windows (target or features), from the current base_point of the dataframe. + Apply function to compute features to each window, return dict with window scpecification as a key, list of features as a value. 
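+    For example, with windows={"": [7], "Load_West": [200e3]} and window_index_name="D", the keys of the returned dict would be
+    something like "7D" and "Load_West:200K" (see get_ts_window_name). When window_features is not None, the computed features
+    are appended to that list in place instead, and the returned dict stays empty.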
+ """ + res = {} + for window_var, windows_lengths in windows.items(): # (p_window_var := tqdmu(windows.items(), desc="window var", leave=False)): + # p_window_var.set_description("window var=" + window_var) + # windows_lengths must be sorted + + if forward_direction: + windows_l = base_point + else: + windows_r = base_point + + if window_var: + if window_var not in df: + break + if forward_direction: + window_var_values = df[window_var].values[base_point:] + else: + window_var_values = df[window_var].values[:base_point] + + for window_order, window_size in enumerate(windows_lengths): # (p_window_size := tqdmu(enumerate(windows_lengths), desc=f"window size", leave=False)): + # p_window_size.set_description("window size=" + str(window_size)) + if window_var == "": # just index + dataset_name = str(window_size) + window_index_name + if forward_direction: + windows_r = min(windows_l + window_size * nrecords_per_period, len(df)) + else: + windows_l = max(windows_r - window_size * nrecords_per_period, 0) + else: + dataset_name = window_var + ":" + get_human_readable_set_size(window_size) + # binary search along that var's cumsum until it reaches the required size + if forward_direction: + windows_r, accumulated_amount = find_next_cumsum_right_index(window_var_values=window_var_values, amount=window_size, left_index=windows_l) + else: + windows_l, accumulated_amount = find_next_cumsum_left_index(window_var_values=window_var_values, amount=window_size, right_index=windows_r) + if accumulated_amount > 0 and accumulated_amount * 2 < window_size: + logger.warning("Insufficient data for window %s of size %s: real size=%s", window_var, window_size, accumulated_amount) + continue + + # Window established. Now need to create features for it. + + window_df = df.iloc[windows_l:windows_r] + if len(window_df): + + if verbose: + logger.info( + "%s, acc.size %s, l=%s, r=%s (%s to %s)", + dataset_name + " " + ("future" if forward_direction else "past"), + accumulated_amount if window_var else "", + windows_l, + windows_r, + window_df.index[0], + window_df.index[-1], + ) + if window_features is not None: + apply_fcn(df=window_df, row_features=window_features, targets=targets, features_names=window_features_names, dataset_name=dataset_name) + else: + temp_window_features = [] + apply_fcn(df=window_df, row_features=temp_window_features, targets=targets, features_names=window_features_names, dataset_name=dataset_name) + res[dataset_name] = temp_window_features + + # print( + # window_var, window_order, window_size, time_from_ts(window_df.MOMENT.values[0]), time_from_ts(window_df.MOMENT.values[-1]) + # ) # ,"->",time_from_ts(future.MOMENT.values[0]),time_from_ts(future.MOMENT.values[-1]),) + + if not overlapping: + if forward_direction: + windows_l = windows_r + else: + windows_r = windows_l + return res + + +def create_ts_features_parallel( + start_index: int, end_index: int = None, ts_func: object = None, n_cores: int = None, logical: bool = False, n_chunks: int = None, **kwargs +): + """ + Divides a span (end_index-start_index) into n_chunks non-overlapping consecutive chunks. + Submits FE pipeline execution in parallel, joins results. 
+ """ + nrecords_per_period = kwargs.get("nrecords_per_period", 1) + + if not end_index: + end_index = len(kwargs.get("df", [])) + if not end_index: + return None, None + end_index = end_index // nrecords_per_period + + if not n_chunks or n_chunks < 0: + n_chunks = min(int(psutil.cpu_count(logical=True) * 1.5), (end_index - start_index)) + + args = [] + step = (end_index - start_index) // n_chunks + if step < 1: + return None, None + + l = start_index + for i in range(n_chunks): + r = min(l + step, end_index) + if i == n_chunks - 1: + r = end_index + + args.append((l, r)) + + l = r + + logger.info("starting applyfunc_parallel using args %s", args) + res = applyfunc_parallel(args, partial(ts_func, **kwargs), return_dataframe=False, logical=logical, n_cores=n_cores) + X_parts, Y_parts = [], [] + for X_part, Y_part in res: + X_parts.append(X_part) + Y_parts.append(Y_part) + + del res + return pd.concat(X_parts, ignore_index=True), pd.concat(Y_parts, ignore_index=True) + + +# ----------------------------------------------------------------------------------------------------------------------------------------------------- +# Detecting optimal lags for FE +# ----------------------------------------------------------------------------------------------------------------------------------------------------- + + +def compute_corr(dependent_vals: np.ndarray, independent_vals: np.ndarray, deciding_func: object, absolutize: bool = True): + if deciding_func is np.corrcoef: + corr = deciding_func(dependent_vals, independent_vals)[0][1] + else: + corr = deciding_func(dependent_vals.reshape(-1, 1), independent_vals)[0] + + if absolutize: + corr = np.abs(corr) + + return corr + + +def general_acf( + Y: np.ndarray, X: np.ndarray = None, windows: dict = {}, deciding_func: object = np.corrcoef, lag_len: int = 30, min_samples=500, absolutize: bool = True +): + """Advanced ACF(nonlinear, +over variables with non-fixed offsets). 
+ windows={var: {from,to,nsteps]}, ie {"Load":{"from":40_000,"to":1e6,"nsteps":100}} + """ + res = {} + + if lag_len: + acfs_vals = [1.0] + acfs_index = [1.0] + for i in tqdmu(range(lag_len), desc="Fixed offsets"): + if len(Y) - i >= min_samples: + dependent_vals = Y[i + 1 :] + independent_vals = Y[: -(i + 1)] + + corr = compute_corr(dependent_vals=dependent_vals, independent_vals=independent_vals, deciding_func=deciding_func, absolutize=absolutize) + + acfs_vals.append(corr) + acfs_index.append(i + 1) + + res["fixed_offsets"] = pd.Series(data=acfs_vals, name="fixed_offsets", index=acfs_index) + + if windows: + for window_var, windows_params in tqdmu(windows.items(), desc="Flexible windows"): # windows_lengths must be sorted + window_var_values = X[window_var].values + acfs_vals = [1.0] + acfs_index = [0.0] + window_sizes = np.linspace( + start=windows_params.get("from", max(1, window_var_values.min())), + stop=windows_params.get("to", window_var_values.sum()), + num=windows_params.get("nsteps", 100), + ) + + for window_size in tqdmu(window_sizes, desc=window_var): + dependent_vals = [] + independent_vals = [] + windows_r = len(window_var_values) - 1 + + while True: + # binary search along that var's cumsum until it reaches required size + windows_l, accumulated_amount = find_next_cumsum_left_index(window_var_values=window_var_values, amount=window_size, right_index=windows_r) + if accumulated_amount * 2 < window_size: # or (windows_r - windows_l) < min_samples + break + else: + dependent_vals.append(Y[windows_r]) + independent_vals.append(Y[windows_l]) + + windows_r = windows_l + if windows_l <= 0: + break + if dependent_vals: + corr = compute_corr( + dependent_vals=np.array(dependent_vals), independent_vals=np.array(independent_vals), deciding_func=deciding_func, absolutize=absolutize + ) + + acfs_vals.append(corr) + acfs_index.append(window_size) + + res[window_var] = pd.Series(data=acfs_vals, name=window_var, index=acfs_index) + return res diff --git a/feature_importance.py b/mlframe/feature_importance.py similarity index 100% rename from feature_importance.py rename to mlframe/feature_importance.py diff --git a/mlframe/feature_selection/__init__.py b/mlframe/feature_selection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mlframe/feature_selection/filters.py b/mlframe/feature_selection/filters.py new file mode 100644 index 0000000..b3a14aa --- /dev/null +++ b/mlframe/feature_selection/filters.py @@ -0,0 +1,3449 @@ +"""Feature selection within ML pipelines. Based on filters. 
Currently includes mRMR.""" + +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# ---------------------------------------------------------------------------------------------------------------------------- + +import logging + +logger = logging.getLogger(__name__) + +while True: + try: + # ---------------------------------------------------------------------------------------------------------------------------- + # Normal Imports + # ---------------------------------------------------------------------------------------------------------------------------- + + from typing import * + + import copy + + import pandas as pd, numpy as np + import os + import gc + + from pyutilz.system import tqdmu + from mlframe.utils import set_random_seed + from pyutilz.pythonlib import store_params_in_object, get_parent_func_args + from pyutilz.parallel import mem_map_array, split_list_into_chunks, parallel_run + from pyutilz.numbalib import set_numba_random_seed, arr2str, python_dict_2_numba_dict, generate_combinations_recursive_njit + + # from mlframe.boruta_shap import BorutaShap + from timeit import default_timer as timer + + from scipy.stats import mode + from scipy import special as sp + from itertools import combinations + from numpy.polynomial.hermite import hermval + from pyutilz.pythonlib import sort_dict_by_value + + from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder + from sklearn.model_selection import KFold + from sklearn.base import is_classifier, is_regressor, BaseEstimator, TransformerMixin + from sklearn.impute import SimpleImputer + from itertools import combinations + from numba.core import types + from mlframe.arrays import arrayMinMax + from numba import njit, jit + import numba + import math + + from joblib import Parallel, delayed + + from astropy.stats import histogram + + from numba import NumbaDeprecationWarning, NumbaPendingDeprecationWarning + import warnings + + except ModuleNotFoundError as e: + + logger.warning(e) + + if "cannot import name" in str(e): + raise (e) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Packages auto-install + # ---------------------------------------------------------------------------------------------------------------------------- + + from pyutilz.pythonlib import ensure_installed + + ensure_installed("numpy pandas scikit-learn") # cupy-cuda11x + + else: + break + +# ---------------------------------------------------------------------------------------------------------------------------- +# Inits +# ---------------------------------------------------------------------------------------------------------------------------- + +warnings.filterwarnings("ignore", module=".*_discretization") +warnings.simplefilter("ignore", category=NumbaDeprecationWarning) +warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning) + +MAX_JOBLIB_NBYTES = 1e3 +NMAX_NONPARALLEL_ITERS = 2 +MAX_ITERATIONS_TO_TRACK = 5 + +LARGE_CONST: float = 1e30 +GPU_MAX_BLOCK_SIZE: int = 1024 +MAX_CONFIRMATION_CAND_NBINS: int = 50 + +caching_hits_xyz = 0 +caching_hits_z = 0 +caching_hits_xz = 0 +caching_hits_yz = 0 + + +def init_kernels(): + + global compute_joint_hist_cuda, compute_mi_from_classes_cuda + + import cupy as cp # pip install cupy-cuda11x; python -m cupyx.tools.install_library --cuda 11.x --library cutensor + + compute_joint_hist_cuda = cp.RawKernel( + r""" + extern "C" 
__global__
+    void compute_joint_hist_cuda(const int *classes_x, const int *classes_y, int *joint_counts, int n, int nbins_y) {
+        int tid = blockDim.x * blockIdx.x + threadIdx.x;
+        if (tid < n) {
+            atomicAdd(&joint_counts[classes_x[tid] * nbins_y + classes_y[tid]], 1);
+        }
+    }
+    """,
+        "compute_joint_hist_cuda",
+    )
+
+    compute_mi_from_classes_cuda = cp.RawKernel(
+        r"""
+    extern "C" __global__
+    void compute_mi_from_classes_cuda(const int *classes_x, const double *freqs_x, const int *classes_y, const double *freqs_y,
+                                      const int *joint_counts, double *totals, int n, int nbins_x, int nbins_y) {
+        int tid = blockDim.x * blockIdx.x + threadIdx.x;
+        if (tid == 0){
+            double total = 0.0;
+            for (int i = 0; i < nbins_x; i++){
+                float prob_x = freqs_x[i];
+                for (int j = 0; j < nbins_y; j++){
+                    int jc = joint_counts[i * nbins_y + j];
+                    if (jc > 0){
+                        float prob_y = freqs_y[j];
+                        double jf = (float)jc / (float)n;
+                        total += jf * log(jf / (prob_x * prob_y));
+                    }
+                }
+            }
+            totals[0] = total;
+        }
+    }
+    """,
+        "compute_mi_from_classes_cuda",
+    )
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Old code
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+def find_impactful_features(
+    X: pd.DataFrame,
+    Y: pd.DataFrame,
+    feature_selector: object = None,
+    model: object = None,
+    importance_measure: str = "shap",
+    classification: bool = False,
+    n_trials: int = 150,
+    normalize: bool = True,
+    train_or_test="train",
+    verbose: bool = True,
+    fit_params: dict = {},
+) -> dict:
+    """
+    Create a dict of inputs impacting each target and all targets (multi-target supported).
+    Wrapped models (like TransformedTargetRegressor) are not supported.
+    """
+
+    if verbose:
+        logger.info("Starting impact analysis for %s row(s), %s feature(s), %s target(s)", X.shape[0], X.shape[1], Y.shape[1])
+
+    if not feature_selector:
+        feature_selector = BorutaShap(
+            model=model,
+            importance_measure=importance_measure,
+            classification=classification,
+            n_trials=n_trials,
+            normalize=normalize,
+            verbose=False,
+            train_or_test=train_or_test,
+            fit_params=fit_params,
+        )
+
+    if False:  # when multioutput is not supported
+        max_targets = 0
+        res = {"accepted": {}, "tentative": {}}
+        wholeset_accepted, wholeset_tentative = [], []
+
+        for var in tqdmu(range(Y.shape[1]), desc="target #"):
+            if max_targets:
+                if var >= max_targets:
+                    break
+            feature_selector.fit(X=X, y=Y.iloc[:, var], n_trials=n_trials, normalize=normalize, verbose=False, train_or_test=train_or_test)
+
+            res["accepted"][var] = feature_selector.accepted
+            res["tentative"][var] = feature_selector.tentative
+
+            if verbose:
+                logger.info(
+                    "%s feature(s) found impactful on target %s: %s, %s tentative: %s",
+                    len(feature_selector.accepted),
+                    var,
+                    sorted(feature_selector.accepted),
+                    len(feature_selector.tentative),
+                    sorted(feature_selector.tentative),
+                )
+
+            wholeset_accepted.extend(feature_selector.accepted)
+            wholeset_tentative.extend(feature_selector.tentative)
+
+        res["wholeset"] = {"accepted": set(wholeset_accepted), "tentative": set(wholeset_tentative)}
+        res["mostvoted_accepted"] = Counter(wholeset_accepted)
+        res["mostvoted_tentative"] = Counter(wholeset_tentative)
+    else:
+        feature_selector.fit(X=X, y=Y)
+        res = {"accepted": sorted(feature_selector.accepted), "tentative": sorted(feature_selector.tentative)}
+    if verbose:
+        logger.info(res)
+    return res
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Multidimensional vectors
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+@njit()
+def merge_vars(
+    factors_data: np.ndarray,
+    vars_indices: Sequence,
+    var_is_nominal: Sequence,
+    factors_nbins: Sequence,
+    dtype=np.int32,
+    min_occupancy: int = None,
+    current_nclasses: int = 1,
+    final_classes: np.ndarray = None,  # , dtype=np.int32
+    verbose: bool = False,
+) -> tuple:
+    """Melts multiple vectors (partitioned into bins or categories) into a one-dimensional one.
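+    For example, merging a 3-bin variable with a 4-bin one yields at most 12 combined classes: a sample falling into bins
+    (b1, b2) gets class b1 + b2 * 3 (assuming no bin of the first variable is empty), and empty combinations are dropped
+    from the returned frequencies.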
+    factors_data columns are assumed to be ordinal-encoded already (by KBinsDiscretizer or similar).
+    Bins scarcely populated in the higher dimension may be merged with their nearest neighbours (in Euclidean distance).
+
+    Process truly numerical vars first? That would allow distributing scarcely populated bins more meaningfully, according to the distances.
+    What order is best to use with regard to the number of categories and the variance of their population density? Big-small, big-small, etc.? Starting from the most (un)evenly distributed?
+    """
+
+    if final_classes is None:
+        final_classes = np.zeros(len(factors_data), dtype=dtype)
+    for var_number, var_index in enumerate(vars_indices):
+
+        expected_nclasses = current_nclasses * factors_nbins[var_index]
+        freqs = np.zeros(expected_nclasses, dtype=dtype)
+        values = factors_data[:, var_index].astype(dtype)
+        if verbose:
+            print(f"var={var_index}, classes from {values.min()} to {values.max()}")
+        for sample_row, sample_class in enumerate(values):
+            newclass = final_classes[sample_row] + sample_class * current_nclasses
+            freqs[newclass] += 1
+            final_classes[sample_row] = newclass
+
+        # clean zeros
+        nzeros = 0
+        lookup_table = np.empty(expected_nclasses, dtype=dtype)
+        for oldclass, npoints in enumerate(freqs):
+            if npoints == 0:
+                nzeros += 1
+            else:
+                pass
+                # points from low-populated regions can be
+                # 1) distributed into closest N regions proportional to distances to them
+                # 2) ALL placed into some separate OUTLIERS region
+                # 3) left as is
+
+                # if npoints < min_occupancy: ...
+
+        freqs = freqs[freqs > 0]
+        current_nclasses = expected_nclasses - nzeros
+    return final_classes, freqs / len(factors_data), current_nclasses
+
+
+# ----------------------------------------------------------------------------------------------------------------------------
+# Information-theory measures
+# ----------------------------------------------------------------------------------------------------------------------------
+
+
+@njit()
+def entropy(freqs: np.ndarray, min_occupancy: float = 0) -> float:
+    if min_occupancy:
+        freqs = freqs[freqs >= min_occupancy]
+
+    return -(np.log(freqs) * freqs).sum()
+
+
+@njit()
+def mi(factors_data, x: np.ndarray, y: np.ndarray, factors_nbins: np.ndarray, verbose: bool = False, dtype=np.int32) -> float:
+    """Computes Mutual Information of X, Y via entropy calculations."""
+
+    classes_x, freqs_x, _ = merge_vars(
+        factors_data=factors_data, vars_indices=x, var_is_nominal=None, factors_nbins=factors_nbins, verbose=verbose, dtype=dtype
+    )
+    entropy_x = entropy(freqs=freqs_x)
+    if verbose:
+        print(f"entropy_x={entropy_x}, nclasses_x={len(freqs_x)} ({classes_x.min()} to {classes_x.max()})")
+
+    _, freqs_y, _ = merge_vars(factors_data=factors_data, vars_indices=y, var_is_nominal=None, factors_nbins=factors_nbins, verbose=verbose, dtype=dtype)
+    entropy_y = entropy(freqs=freqs_y)
+    if verbose:
+        print(f"entropy_y={entropy_y}, nclasses_y={len(freqs_y)}")
+
+    classes_xy, freqs_xy, _ = merge_vars(
+        factors_data=factors_data, vars_indices=set(x) | set(y), var_is_nominal=None, factors_nbins=factors_nbins, verbose=verbose, dtype=dtype
+    )
+    entropy_xy = entropy(freqs=freqs_xy)
+    if verbose:
+        print(f"entropy_xy={entropy_xy}, nclasses_x={len(freqs_xy)} ({classes_xy.min()} to {classes_xy.max()})")
+
+    return entropy_x + entropy_y - entropy_xy
+
+
+@njit()
+def unpack_and_sort(x, z):
+    res = []
+    for el in x:
+        res.append(el)
+    for el in z:
+        res.append(el)
+    return sorted(res)
+
+
+@njit()
+def conditional_mi(
+    factors_data: np.ndarray,
+    x: tuple,
+    y: tuple,
+    z: tuple,
+    var_is_nominal: Sequence,
+
factors_nbins: Sequence, + entropy_z: float = -1.0, + entropy_xz: float = -1.0, + entropy_yz: float = -1.0, + entropy_xyz: float = -1.0, + entropy_cache: dict = None, + can_use_x_cache: bool = False, + can_use_y_cache: bool = False, + dtype=np.int32, +) -> float: + """ + Conditional Mutual Information about Y by X given Z = I (X ;Y | Z ) = H(X, Z) + H(Y, Z) - H(Z) - H(X, Y, Z) + When y is constant, and both X(candidates) and Z (veteranes) repeat a lot, there's room to optimize. + Also when parts of X repeat a lot (2, 3 way interactions). Z and Y are always 1-dim. + """ + # global caching_hits_xyz, caching_hits_z, caching_hits_xz, caching_hits_yz + + key = "" + + if entropy_z < 0: + if entropy_cache is not None: + key = arr2str(sorted(z)) + entropy_z = entropy_cache.get(key, -1) + if entropy_z < 0: + _, freqs_z, _ = merge_vars(factors_data=factors_data, vars_indices=z, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) # always 1-dim + entropy_z = entropy(freqs=freqs_z) + if entropy_cache is not None: + entropy_cache[key] = entropy_z + # else: + # caching_hits_z += 1 + + if entropy_xz < 0: + + # indices = sorted([*x, *z]) + indices = unpack_and_sort(x, z) + + if can_use_x_cache and entropy_cache is not None: + key = arr2str(indices) + entropy_xz = entropy_cache.get(key, -1) + if entropy_xz < 0: + _, freqs_xz, _ = merge_vars(factors_data=factors_data, vars_indices=indices, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) + entropy_xz = entropy(freqs=freqs_xz) + if can_use_x_cache and entropy_cache is not None: + entropy_cache[key] = entropy_xz + # else: + # caching_hits_xz += 1 + + current_nclasses_yz = 1 + if can_use_y_cache: + if entropy_yz < 0: + + # indices = sorted([*y, *z]) + indices = unpack_and_sort(y, z) + + if entropy_cache is not None: + key = arr2str(indices) + entropy_yz = entropy_cache.get(key, -1) + if entropy_yz < 0: + classes_yz, freqs_yz, current_nclasses_yz = merge_vars( + factors_data=factors_data, vars_indices=indices, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype + ) + entropy_yz = entropy(freqs=freqs_yz) + if entropy_cache is not None: + entropy_cache[key] = entropy_yz + # else: + # caching_hits_yz += 1 + else: + classes_yz, freqs_yz, current_nclasses_yz = merge_vars( + factors_data=factors_data, vars_indices=unpack_and_sort(y, z), var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype # [*y, *z] + ) # always 2-dim + entropy_yz = entropy(freqs=freqs_yz) + + if entropy_xyz < 0: + if can_use_y_cache and can_use_x_cache: + + # indices = sorted([*x, *y, *z]) + indices = unpack_and_sort(x, y) + indices = unpack_and_sort(indices, z) + + if entropy_cache is not None: + key = arr2str(indices) + entropy_xyz = entropy_cache.get(key, -1) + if entropy_xyz < 0: + if current_nclasses_yz == 1: + classes_yz, freqs_yz, current_nclasses_yz = merge_vars( + factors_data=factors_data, vars_indices=unpack_and_sort(y, z), var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype # [*y, *z] + ) # always 2-dim + + _, freqs_xyz, _ = merge_vars( + # factors_data=factors_data, vars_indices=[*y, *z, *x], var_is_nominal=None, factors_nbins=factors_nbins + factors_data=factors_data, + vars_indices=x, + var_is_nominal=None, + factors_nbins=factors_nbins, + current_nclasses=current_nclasses_yz, + final_classes=classes_yz, + dtype=dtype, + ) # upper classes of [*y, *z] can serve here. (2+x)-dim, ie 3 to 5 dim. 
+ entropy_xyz = entropy(freqs=freqs_xyz) + if entropy_cache is not None and can_use_y_cache and can_use_x_cache: + entropy_cache[key] = entropy_xyz + # else: + # caching_hits_xyz += 1 + res = entropy_xz + entropy_yz - entropy_z - entropy_xyz + if res < 0.0: + # print(x, y, z, res, can_use_y_cache) + res = 0.0 + return res + + +@njit() +def compute_mi_from_classes( + classes_x: np.ndarray, + freqs_x: np.ndarray, + classes_y: np.ndarray, + freqs_y: np.ndarray, + dtype=np.int32, +) -> float: + + joint_counts = np.zeros((len(freqs_x), len(freqs_y)), dtype=dtype) + for i, j in zip(classes_x, classes_y): + joint_counts[i, j] += 1 + joint_freqs = joint_counts / len(classes_x) + + total = 0.0 + for i in range(len(freqs_x)): + prob_x = freqs_x[i] + for j in range(len(freqs_y)): + jf = joint_freqs[i, j] + if jf: + prob_y = freqs_y[j] + total += jf * math.log(jf / (prob_x * prob_y)) + return total + + +@njit() +def distribute_permutations(npermutations: int, n_workers: int) -> list: + + avg_perms_per_worker = npermutations // n_workers + diff = npermutations - avg_perms_per_worker * n_workers + workload = [avg_perms_per_worker] * n_workers + if diff > 0: + workload[-1] = workload[-1] + diff + return workload + + +# @njit() +def mi_direct( + factors_data, + x: tuple, + y: tuple, + factors_nbins: np.ndarray, + min_occupancy: int = None, + dtype=np.int32, + npermutations: int = 10, + max_failed: int = None, + min_nonzero_confidence: float = 0.95, + classes_y: np.ndarray = None, + classes_y_safe: np.ndarray = None, + freqs_y: np.ndarray = None, + n_workers: int = 1, + workers_pool: object = None, + parallel_kwargs: dict = {}, +) -> tuple: + + classes_x, freqs_x, _ = merge_vars(factors_data=factors_data, vars_indices=x, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) + if classes_y is None: + classes_y, freqs_y, _ = merge_vars(factors_data=factors_data, vars_indices=y, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) + + original_mi = compute_mi_from_classes(classes_x=classes_x, freqs_x=freqs_x, classes_y=classes_y, freqs_y=freqs_y, dtype=dtype) + + # logger.info(f"original_mi={original_mi}") + + if original_mi > 0 and npermutations > 0: + + # Inits + + nfailed = 0 + if not max_failed: + max_failed = int(npermutations * (1 - min_nonzero_confidence)) + if max_failed <= 1: + max_failed = 1 + + if n_workers and n_workers > 1 and npermutations > NMAX_NONPARALLEL_ITERS: + + # logger.info("Memmapping classes_x...") + # classes_x_memmap = mem_map_array(obj=classes_x, file_name="classes_x", mmap_mode="r") + + if workers_pool is None: + workers_pool = Parallel(n_jobs=n_workers, **parallel_kwargs) + + res = workers_pool( + delayed(parallel_mi)( + classes_x=classes_x, + freqs_x=freqs_x, + classes_y=classes_y_safe, + freqs_y=freqs_y, + dtype=dtype, + npermutations=worker_npermutations, + original_mi=original_mi, + max_failed=max_failed, + ) + for worker_npermutations in distribute_permutations(npermutations=npermutations, n_workers=n_workers) + ) + + i = 0 + for worker_nfailed, worker_i in res: + nfailed += worker_nfailed + i += worker_i + + if nfailed >= max_failed: + original_mi = 0.0 + + else: + + if classes_y_safe is None: + classes_y_safe = classes_y.copy() + + # Create a Generator instance with a specific RNG state + # seed=int.from_bytes(os.urandom(4), byteorder='little') + # rng = np.random.default_rng(seed) + + for i in range(npermutations): + # logger.info(f"x={x} perm {i}") + # np.random.shuffle(classes_y_safe) + shuffle_arr(classes_y_safe) + # Shuffle the array using the local 
RNG + # classes_y_shuffled=np.random.choice(classes_y, len(classes_y), replace=False) + # rng.shuffle(classes_y_safe) + # logger.info(f"x={x} shuffled") + # mi = compute_mi_from_classes(classes_x=classes_x, freqs_x=freqs_x, classes_y=classes_y_shuffled, freqs_y=freqs_y, dtype=dtype) + mi = compute_mi_from_classes(classes_x=classes_x, freqs_x=freqs_x, classes_y=classes_y_safe, freqs_y=freqs_y, dtype=dtype) + + if mi >= original_mi: + nfailed += 1 + if nfailed >= max_failed: + original_mi = 0.0 + break + + confidence = 1 - nfailed / (i + 1) + + else: + confidence = 0 + + return original_mi, confidence + + +@njit() +def shuffle_arr(arr: np.ndarray) -> None: + np.random.shuffle(arr) + + +@njit() +def parallel_mi( + classes_x: np.ndarray, + freqs_x: np.ndarray, + classes_y: np.ndarray, + freqs_y: np.ndarray, + npermutations: int, + original_mi: float, + max_failed: int, + dtype=np.int32, +) -> int: + + # print("In parallel_mi. Copying classes_y...") + nfailed = 0 + classes_y_safe = np.asarray(classes_y).copy() + # print("In parallel_mi. Copied.") + for i in range(npermutations): + + np.random.shuffle(classes_y_safe) + mi = compute_mi_from_classes(classes_x=classes_x, freqs_x=freqs_x, classes_y=classes_y_safe, freqs_y=freqs_y, dtype=dtype) + + if mi >= original_mi: + nfailed += 1 + if nfailed >= max_failed: + break + return nfailed, i + 1 + + +def mi_direct_gpu( + factors_data, + x: tuple, + y: tuple, + factors_nbins: np.ndarray, + min_occupancy: int = None, + dtype=np.int32, + npermutations: int = 10, + max_failed: int = None, + min_nonzero_confidence: float = 0.95, + classes_y: np.ndarray = None, + classes_y_safe: np.ndarray = None, + freqs_y: np.ndarray = None, + freqs_y_safe: np.ndarray = None, + use_gpu: bool = True, +) -> tuple: + + classes_x, freqs_x, _ = merge_vars(factors_data=factors_data, vars_indices=x, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) + if classes_y is None: + classes_y, freqs_y, _ = merge_vars(factors_data=factors_data, vars_indices=y, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) + + original_mi = compute_mi_from_classes(classes_x=classes_x, freqs_x=freqs_x, classes_y=classes_y, freqs_y=freqs_y, dtype=dtype) + + if original_mi > 0 and npermutations > 0: + + # Inits + + if not max_failed: + max_failed = int(npermutations * (1 - min_nonzero_confidence)) + if max_failed <= 1: + max_failed = 1 + + if classes_y_safe is None: + classes_y_safe = cp.asarray(classes_y).astype(cp.int32) + if freqs_y_safe is None: + freqs_y_safe = cp.asarray(freqs_y) + + totals = cp.zeros(1, dtype=np.float64) + joint_counts = cp.zeros((len(freqs_x), len(freqs_y)), dtype=cp.int32) + + block_size = GPU_MAX_BLOCK_SIZE + grid_size = math.ceil(len(classes_x) / block_size) + + classes_x = cp.asarray(classes_x.astype(np.int32)) + freqs_x = cp.asarray(freqs_x) + nfailed = 0 + for i in range(npermutations): + + cp.random.shuffle(classes_y_safe) + joint_counts.fill(0) + compute_joint_hist_cuda( + (grid_size,), + (block_size,), + (classes_x, classes_y_safe, joint_counts, len(classes_x), len(freqs_y)), + ) + compute_mi_from_classes_cuda( + (1,), + (1,), + (classes_x, freqs_x, classes_y_safe, freqs_y_safe, joint_counts, totals, len(classes_x), len(freqs_x), len(freqs_y)), + ) + + mi = totals.get()[0] + + if mi >= original_mi: + nfailed += 1 + if nfailed >= max_failed: + original_mi = 0.0 + break + confidence = 1 - nfailed / (i + 1) + else: + confidence = 0 + + return original_mi, confidence + + +# 
---------------------------------------------------------------------------------------------------------------------------- +# Factor screening +# ---------------------------------------------------------------------------------------------------------------------------- + + +def get_candidate_name(candidate_indices: list, factors_names: list) -> str: + cand_name = "-".join([factors_names[el] for el in candidate_indices]) + return cand_name + + +def should_skip_candidate( + cand_idx: int, + X: tuple, + interactions_order: int, + failed_candidates: set, + added_candidates: set, + expected_gains: np.ndarray, + selected_vars: list, + selected_interactions_vars: list, + only_unknown_interactions: bool = True, +) -> tuple: + """Decides if current candidate for predictors should be skipped + ('cause of being already accepted, failed, computed). + """ + + nexisting = 0 + + if (cand_idx in failed_candidates) or (cand_idx in added_candidates) or expected_gains[cand_idx]: + return True, nexisting + + if interactions_order > 1: # disabled for single predictors 'cause Fleuret formula won't detect pairs predictors + + # --------------------------------------------------------------------------------------------------------------- + # Check if any of sub-elements is already selected at this stage + # --------------------------------------------------------------------------------------------------------------- + + skip_cand = False + for subel in X: + if subel in selected_interactions_vars: + skip_cand = True + break + if skip_cand: + return True, nexisting + + # --------------------------------------------------------------------------------------------------------------- + # Or all selected at the lower stages + # --------------------------------------------------------------------------------------------------------------- + + skip_cand = [(subel in selected_vars) for subel in X] + nexisting = sum(skip_cand) + if (only_unknown_interactions and any(skip_cand)) or all(skip_cand): + return True, nexisting + + return False, nexisting + + +def get_fleuret_criteria_confidence_parallel( + data_copy: np.ndarray, + factors_nbins: np.ndarray, + x: tuple, + y: tuple, + selected_vars: list, + bootstrapped_gain: float, + npermutations: int, + max_failed: int, + nexisting: int, + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + max_veteranes_interactions_order: int = 1, + extra_knowledge_multipler: float = -1.0, + sink_threshold: float = -1.0, + cached_cond_MIs: dict = None, + n_workers: int = 1, + workers_pool: object = None, + parallel_kwargs: dict = {}, + entropy_cache: dict = None, + extra_x_shuffling: bool = True, + dtype=np.int32, +) -> tuple: + + nfailed = 0 + + if workers_pool is None: + workers_pool = Parallel(n_jobs=n_workers, **parallel_kwargs) + + gc.collect() + res = workers_pool( + delayed(parallel_fleuret)( + data=data_copy, + factors_nbins=factors_nbins, + x=x, + y=y, + selected_vars=selected_vars, + npermutations=worker_npermutations, + bootstrapped_gain=bootstrapped_gain, + max_failed=max_failed, + nexisting=nexisting, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + extra_knowledge_multipler=extra_knowledge_multipler, + sink_threshold=sink_threshold, + cached_cond_MIs=dict(cached_cond_MIs), # cached_cond_MIs needs to be unjitted before sending to joblib. + entropy_cache=dict(entropy_cache), # entropy_cache needs to be unjitted before sending to joblib. 
+ extra_x_shuffling=extra_x_shuffling, + dtype=dtype, + ) + for worker_npermutations in distribute_permutations(npermutations=npermutations, n_workers=n_workers) + ) + + nchecked = 0 + for worker_nfailed, worker_i, entropy_cache_dict in res: + nfailed += worker_nfailed + nchecked += worker_i + for key, value in entropy_cache_dict.items(): + entropy_cache[key] = value + + if nfailed >= max_failed: + bootstrapped_gain = 0.0 + + confidence = 1 - nfailed / nchecked + + return bootstrapped_gain, confidence, entropy_cache + + +def parallel_fleuret( + data: np.ndarray, + factors_nbins: np.ndarray, + x: tuple, + y: tuple, + selected_vars: list, + npermutations: int, + bootstrapped_gain: float, + max_failed: int, + nexisting: int, + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + max_veteranes_interactions_order: int = 1, + extra_knowledge_multipler: float = -1.0, + sink_threshold: float = -1.0, + cached_cond_MIs: dict = None, + entropy_cache: dict = None, + extra_x_shuffling: bool = True, + dtype=np.int32, +): + """Stub that gets called by joblib directly. Njits entropy_cache, allocates data copy. Calls fast njitted core sub.""" + data_copy = data.copy() + + entropy_cache_dict = numba.typed.Dict.empty( + key_type=types.unicode_type, + value_type=types.float64, + ) + python_dict_2_numba_dict(python_dict=entropy_cache, numba_dict=entropy_cache_dict) + + cached_cond_MIs_dict = numba.typed.Dict.empty( + key_type=types.unicode_type, + value_type=types.float64, + ) + python_dict_2_numba_dict(python_dict=cached_cond_MIs, numba_dict=cached_cond_MIs_dict) + + nfailed, i = get_fleuret_criteria_confidence( + data_copy=data_copy, + factors_nbins=factors_nbins, + x=x, + y=y, + selected_vars=selected_vars, + npermutations=npermutations, + bootstrapped_gain=bootstrapped_gain, + max_failed=max_failed, + nexisting=nexisting, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + cached_cond_MIs=cached_cond_MIs_dict, + entropy_cache=entropy_cache_dict, + extra_x_shuffling=extra_x_shuffling, + dtype=dtype, + ) + + return nfailed, i, dict(entropy_cache_dict) + + +@njit() +def get_fleuret_criteria_confidence( + data_copy: np.ndarray, + factors_nbins: np.ndarray, + x: tuple, + y: tuple, + selected_vars: list, + npermutations: int, + bootstrapped_gain: float, + max_failed: int, + nexisting: int, + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + max_veteranes_interactions_order: int = 1, + extra_knowledge_multipler: float = -1.0, + sink_threshold: float = -1.0, + cached_cond_MIs: dict = None, + entropy_cache: dict = None, + extra_x_shuffling: bool = True, + dtype=np.int32, +) -> tuple: + """Sub to njit work with random shuffling as well.""" + + nfailed = 0 + + for i in range(npermutations): + + for idx in y: + np.random.shuffle(data_copy[:, idx]) + + if extra_x_shuffling: + for idx in x: + np.random.shuffle(data_copy[:, idx]) + + stopped_early, current_gain, k, sink_reasons = evaluate_gain( + current_gain=LARGE_CONST, + last_checked_k=-1, + X=x, + y=y, + best_gain=None, + factors_data=data_copy, + factors_nbins=factors_nbins, + selected_vars=selected_vars, + nexisting=nexisting, + direct_gain=bootstrapped_gain, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + extra_knowledge_multipler=extra_knowledge_multipler, + sink_threshold=sink_threshold, + 
cached_cond_MIs=cached_cond_MIs, + entropy_cache=entropy_cache, + can_use_x_cache=not extra_x_shuffling, + can_use_y_cache=False, + confidence_mode=True, + ) + + if current_gain >= bootstrapped_gain: + nfailed += 1 + if nfailed >= max_failed: + break + + return nfailed, i + 1 + + +def evaluate_candidates( + workload: list, + y: Sequence[int], + best_gain: float, + factors_data: np.ndarray, + factors_nbins: np.ndarray, + factors_names: Sequence[str], + partial_gains: dict, + selected_vars: list, + baseline_npermutations: int, + classes_y: np.ndarray = None, + freqs_y: np.ndarray = None, + freqs_y_safe: np.ndarray = None, + use_gpu: bool = True, + cached_MIs: dict = None, + cached_confident_MIs: dict = None, + cached_cond_MIs: dict = None, + entropy_cache: dict = None, + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + max_veteranes_interactions_order: int = 1, + dtype=np.int32, + max_runtime_mins: int = None, + start_time: float = None, + min_relevance_gain: float = None, + verbose: int = 1, + ndigits: int = 5, +) -> None: + + best_gain = -LARGE_CONST + best_candidate = None + expected_gains = {} + + from pyutilz.logginglib import init_logging + + global logger + logger = init_logging(default_caller_name="scalping.py", format="%(asctime)s - %(levelname)s - %(funcName)s-line:%(lineno)d - %(message)s") + + # if verbose: logger.info("In evaluate_candidates") + + entropy_cache_dict = numba.typed.Dict.empty( + key_type=types.unicode_type, + value_type=types.float64, + ) + python_dict_2_numba_dict(python_dict=entropy_cache, numba_dict=entropy_cache_dict) + + cached_cond_MIs_dict = numba.typed.Dict.empty( + key_type=types.unicode_type, + value_type=types.float64, + ) + python_dict_2_numba_dict(python_dict=cached_cond_MIs, numba_dict=cached_cond_MIs_dict) + + classes_y_safe = classes_y.copy() + # np.random.seed(int.from_bytes(os.urandom(4), byteorder='little')) + + for cand_idx, X, nexisting in (candidates_pbar := tqdmu(workload, leave=False, desc="Thread Candidates")): + + # if verbose: logger.info(f"Evaluating cand {X}") + + current_gain, sink_reasons = evaluate_candidate( + cand_idx=cand_idx, + X=X, + y=y, + nexisting=nexisting, + best_gain=best_gain, + factors_data=factors_data, + factors_nbins=factors_nbins, + factors_names=factors_names, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + use_gpu=use_gpu, + freqs_y_safe=freqs_y_safe, + partial_gains=partial_gains, + baseline_npermutations=baseline_npermutations, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + expected_gains=expected_gains, + selected_vars=selected_vars, + cached_MIs=cached_MIs, + cached_confident_MIs=cached_confident_MIs, + cached_cond_MIs=cached_cond_MIs_dict, + entropy_cache=entropy_cache_dict, + verbose=verbose, + ndigits=ndigits, + dtype=dtype, + ) + + # if verbose: logger.info(f"X={X}, gain={current_gain}") + + best_gain, best_candidate, run_out_of_time = handle_best_candidate( + current_gain=current_gain, + best_gain=best_gain, + X=X, + best_candidate=best_candidate, + factors_names=factors_names, + verbose=verbose, + ndigits=ndigits, + max_runtime_mins=max_runtime_mins, + start_time=start_time, + min_relevance_gain=min_relevance_gain, + ) + + if run_out_of_time: + break + + for key, value in entropy_cache_dict.items(): + entropy_cache[key] = value + for key, value in cached_cond_MIs_dict.items(): + cached_cond_MIs[key] = value + + if verbose: + 
logger.info(f"Batch finished.") + + return best_gain, best_candidate, partial_gains, expected_gains, cached_MIs, cached_cond_MIs, entropy_cache + + +def handle_best_candidate( + current_gain: float, + best_gain: float, + X: Sequence, + best_candidate: Sequence, + factors_names: list, + verbose: int = 1, + ndigits: int = 5, + max_runtime_mins: int = None, + start_time: float = None, + min_relevance_gain: float = None, +): + # --------------------------------------------------------------------------------------------------------------- + # Save best known candidate, to be able to use early stopping + # --------------------------------------------------------------------------------------------------------------- + + run_out_of_time = False + + if current_gain > best_gain: + best_candidate = X + best_gain = current_gain + if verbose > 1: + logger.info( + f"\t{get_candidate_name(best_candidate,factors_names=factors_names)} is so far the best candidate with best_gain={best_gain:.{ndigits}f}" + ) + else: + if min_relevance_gain and verbose > 1 and current_gain > min_relevance_gain: + logger.info(f"\t\t{get_candidate_name(X,factors_names=factors_names)} current_gain={current_gain:.{ndigits}f}") + + if max_runtime_mins and not run_out_of_time: + run_out_of_time = (timer() - start_time) > max_runtime_mins * 60 + + return best_gain, best_candidate, run_out_of_time + + +@njit() +def evaluate_gain( + current_gain: float, + last_checked_k: int, + direct_gain: float, + X: Sequence[int], + y: Sequence[int], + nexisting: int, + best_gain: float, + factors_data: np.ndarray, + factors_nbins: np.ndarray, + selected_vars: list, + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + max_veteranes_interactions_order: int = 2, + extra_knowledge_multipler: float = -1.0, + sink_threshold: float = -1.0, + entropy_cache: dict = None, + cached_cond_MIs: dict = None, + can_use_x_cache=False, + can_use_y_cache=False, + dtype=np.int32, + confidence_mode: bool = False, +) -> tuple: + + positive_mode = False + stopped_early = False + sink_reasons = None + + k = 0 + for interactions_order in range(max_veteranes_interactions_order): + combs = generate_combinations_recursive_njit(np.array(selected_vars, dtype=np.int32), interactions_order + 1)[::-1] + # if X==(425,): logger.info(f"\t combs={combs}") + + for Z in combs: + + if k > last_checked_k: + if confidence_mode and count_cand_nbins(Z, factors_nbins) > MAX_CONFIRMATION_CAND_NBINS: + additional_knowledge = 0.0 # this is needed to skip checking agains hi cardinality approved factors + else: + if mrmr_relevance_algo == "fleuret": + + # --------------------------------------------------------------------------------------------------------------- + # additional_knowledge = I (X ;Y | Z ) = H(X, Z) + H(Y, Z) - H(Z) - H(X, Y, Z) + # I (X,Z) would be entropy_x + entropy_z - entropy_xz. 
+ # --------------------------------------------------------------------------------------------------------------- + + key_found = False + if not confidence_mode: + key = arr2str(X) + "_" + arr2str(Z) + if key in cached_cond_MIs: + additional_knowledge = cached_cond_MIs[key] + # if X==(425,): logger.info(f"\t additional_knowledge from {Z} found to be {additional_knowledge}, k={k}, last_checked_k={last_checked_k}") + key_found = True + + if not key_found: + + additional_knowledge = conditional_mi( + factors_data=factors_data, + x=X, + y=y, + z=Z, + var_is_nominal=None, + factors_nbins=factors_nbins, + entropy_cache=entropy_cache, + can_use_x_cache=can_use_x_cache, + can_use_y_cache=can_use_y_cache, + dtype=dtype, + ) + + if nexisting > 0: + additional_knowledge = additional_knowledge ** (nexisting + 1) + # else: + # if len(X) > 1: + # additional_knowledge = additional_knowledge ** (len(X) + 1) + + if not confidence_mode: + cached_cond_MIs[key] = additional_knowledge + + # if X==(425,): logger.info(f"\t additional_knowledge from {Z}={additional_knowledge}, k={k}, last_checked_k={last_checked_k}") + + # --------------------------------------------------------------------------------------------------------------- + # Account for possible extra knowledge from conditioning on Z? + # that must update best_gain globally. log such cases. Note that we do not guarantee finding them in order, + # but they are too precious to ignore. Adding this will also allow to skip higher order interactions + # containing all of already approved candidates. + # --------------------------------------------------------------------------------------------------------------- + + if extra_knowledge_multipler > 0 and additional_knowledge > direct_gain * extra_knowledge_multipler: + bwarn = False + if not positive_mode: + current_gain = additional_knowledge + positive_mode = True + bwarn = True + else: + # rare chance that a candidate has many excellent relationships + if additional_knowledge > current_gain: + current_gain = additional_knowledge + bwarn = True + + # if bwarn: + # if verbose: + # if current_gain > best_gain: + # logger.info( + # f"\tCandidate {get_candidate_name(X,factors_names=factors_names)} together with factor {get_candidate_name(Z,factors_names=factors_names)} has synergetic influence {additional_knowledge:{ndigits}f} (direct MI={direct_gain:{ndigits}f})" + # ) + + if not positive_mode and (additional_knowledge < current_gain): + + current_gain = additional_knowledge + + if best_gain is not None and current_gain <= best_gain: + + # --------------------------------------------------------------------------------------------------------------- + # No point checking other Zs, 'cause current_gain already won't be better than the best_gain + # (if best_gain was estimated confidently, which we'll check at the end.) 
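+ # The scan position k is returned to the caller, which stores (current_gain, k) in partial_gains, so a later pass resumes from k+1 via last_checked_k instead of re-scanning every Z from scratch.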
+ # --------------------------------------------------------------------------------------------------------------- + + # let's also fix what Z caused X (the most) to sink + + if sink_threshold > -1 and current_gain < sink_threshold: + sink_reasons = Z + + stopped_early = True + return stopped_early, current_gain, k, sink_reasons + k += 1 + + return stopped_early, current_gain, k - 1, sink_reasons + + +def evaluate_candidate( + cand_idx: int, + X: Sequence[int], + y: Sequence[int], + nexisting: int, + best_gain: float, + factors_data: np.ndarray, + factors_nbins: np.ndarray, + factors_names: Sequence[str], + expected_gains: np.ndarray, + partial_gains: dict, + selected_vars: list, + baseline_npermutations: int, + classes_y: np.ndarray = None, + classes_y_safe: np.ndarray = None, + freqs_y: np.ndarray = None, + freqs_y_safe: np.ndarray = None, + use_gpu: bool = True, + cached_MIs: dict = None, + cached_confident_MIs: dict = None, + cached_cond_MIs: dict = None, + entropy_cache: dict = None, + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + max_veteranes_interactions_order: int = 2, + extra_knowledge_multipler: float = -1.0, + sink_threshold: float = -1.0, + dtype=np.int32, + verbose: int = 1, + ndigits: int = 5, +) -> None: + # logger.info("In evaluate_candidate") + sink_reasons = set() + + # --------------------------------------------------------------------------------------------------------------- + # Is this candidate any good for target 1-vs-1? + # --------------------------------------------------------------------------------------------------------------- + + if X in cached_confident_MIs: # use cached_confident_MIs first here as they are more reliable. (but not fill them) + direct_gain = cached_confident_MIs[X] + else: + if X in cached_MIs: + direct_gain = cached_MIs[X] + else: + if use_gpu: + direct_gain, _ = mi_direct_gpu( + factors_data, + x=X, + y=y, + factors_nbins=factors_nbins, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + freqs_y_safe=freqs_y_safe, + min_nonzero_confidence=1.0, + npermutations=baseline_npermutations, + dtype=dtype, + ) + else: + # logger.info("Computing mi_direct") + direct_gain, _ = mi_direct( + factors_data, + x=X, + y=y, + factors_nbins=factors_nbins, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + min_nonzero_confidence=1.0, + npermutations=baseline_npermutations, + dtype=dtype, + ) + # logger.info("Computed mi_direct") + cached_MIs[X] = direct_gain + + if direct_gain > 0: + if selected_vars: + + # --------------------------------------------------------------------------------------------------------------- + # Some factors already selected. + # best gain from including X is the minimum of I (X ;Y | Z ) over every Z in already selected_vars. + # but imaging some variable is correlated to every real predictor plus has random noise. It's real value is zero. + # only computing I (X ;Y | Z ) will still leave significant impact. but if we sum I(X,Z) over Zs we'll see it shares + # all its knowledge with the rest of factors and has no value by itself. But to see that, we must already have all real factors included in S. + # otherwise, such 'connected-to-all' trash variables will dominate the scene. So how to handle them? + # Solution is to compute sum(X,Z) not only at the step of adding Z, but to repeat this procedure for all Zs once new X is added. + # Maybe some Zs will render useless by adding that new X. 
+ # --------------------------------------------------------------------------------------------------------------- + + if cand_idx in partial_gains: + current_gain, last_checked_k = partial_gains[cand_idx] + # if X==(425,): logger.info(f"\t cand_idx in partial_gains: {current_gain, last_checked_k}") + if best_gain is not None and (current_gain <= best_gain): + return current_gain, sink_reasons + else: + current_gain = LARGE_CONST + last_checked_k = -1 + + stopped_early, current_gain, k, sink_reasons = evaluate_gain( + current_gain=current_gain, + last_checked_k=last_checked_k, + direct_gain=direct_gain, + X=X, + y=y, + nexisting=nexisting, + best_gain=best_gain, + factors_data=factors_data, + factors_nbins=factors_nbins, + selected_vars=selected_vars, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + extra_knowledge_multipler=extra_knowledge_multipler, + sink_threshold=sink_threshold, + entropy_cache=entropy_cache, + cached_cond_MIs=cached_cond_MIs, + can_use_x_cache=True, + can_use_y_cache=True, + ) + # if X==(425,): logger.info(f"\t stopped_early, current_gain, k, sink_reasons={stopped_early, current_gain, k, sink_reasons}") + + partial_gains[cand_idx] = current_gain, k + if not stopped_early: # there was no break. current_gain computed fully. this line was (and most likely should be) commented out. + expected_gains[cand_idx] = current_gain + else: + # no factors selected yet. current_gain is just direct_gain + current_gain = direct_gain + expected_gains[cand_idx] = current_gain + else: + current_gain = 0 + + return current_gain, sink_reasons + + +def test(a): + logger.info("test") + return 0 + + +# @njit() +def screen_predictors( + # factors + factors_data: np.ndarray, + factors_nbins: Sequence[int], + factors_names: Sequence[str] = None, + factors_names_to_use: Sequence[str] = None, + factors_to_use: Sequence[int] = None, + # targets + targets_data: np.ndarray = None, + targets_nbins: Sequence[int] = None, + y: Sequence[int] = None, + # algorithm + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + reduce_gain_on_subelement_chosen: bool = True, + # performance + extra_x_shuffling: bool = True, + dtype=np.int32, + random_seed: int = None, + use_gpu: bool = False, + n_workers: int = 1, + # confidence + min_occupancy: int = None, + min_nonzero_confidence: float = 0.99, + full_npermutations: int = 1_000, + baseline_npermutations: int = 100, + # stopping conditions + min_relevance_gain: float = 0.00001, + max_consec_unconfirmed: int = 30, + max_runtime_mins: int = None, + interactions_min_order: int = 1, + interactions_max_order: int = 1, + interactions_order_reversed: bool = False, + max_veteranes_interactions_order: int = 1, + only_unknown_interactions: bool = False, + # verbosity and formatting + verbose: int = 1, + ndigits: int = 5, + parallel_kwargs=dict(max_nbytes=MAX_JOBLIB_NBYTES), +) -> float: + """Finds best predictors for the target. + x must be n-x-m array of integers (ordinal encoded) + Parameters: + full_npermutations: when computing every MI, repeat calculations with randomly shuffled indices that many times + min_nonzero_confidence: if in random permutation tests this or higher % of cases had worse current_gain than original, current_gain value is considered valid, otherwise, it's set to zero. 
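+ For example (illustration only): with full_npermutations=1_000 and min_nonzero_confidence=0.99, up to max_failed = int(1_000 * (1 - 0.99)) = 10 shuffled permutations may reach the original gain before the candidate's gain is zeroed out.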
+ Returns: + 1) best set of non-redundant single features influencing the target + 2) subsets of size 2..interactions_max_order influencing the target. Such subsets will be candidates for predictors and OtherVarsEncoding. + 3) all 1-vs-1 influencers (not necessarily in mRMR) + Parameters: + only_unknown_interactions: True for speed, False for completeness of higher order interactions discovery. + verbose: int 1=log only important info,>1=also log additional details + mrmr_relevance_algo:str + "fleuret": max(min(I(X,Y|Z)),max(I(X,Y|Z)-I(X,Y))) Possible to use n-way interactions here. + "pld": I(X,Y) + mrmr_redundancy_algo:str + "fleuret": 0 ('cause redundancy already accounted for) + "pld_max": max(I(veterane,cand)) Possible to use n-way interactions here. + "pld_mean": mean(I(veterane,cand)) Possible to use n-way interactions here. + """ + # --------------------------------------------------------------------------------------------------------------- + # Input checks + # --------------------------------------------------------------------------------------------------------------- + + assert mrmr_relevance_algo in ("fleuret", "pld") + assert mrmr_redundancy_algo in ("fleuret", "pld_max", "pld_mean") + + assert len(factors_data) >= 10 + if targets_data is None: + targets_data = factors_data + else: + assert len(factors_data) == len(targets_data) + + if targets_nbins is None: + targets_nbins = factors_nbins + + assert targets_data.shape[1] == len(targets_nbins) + assert factors_data.shape[1] == len(factors_nbins) + + if len(factors_names) == 0: + factors_names = ["F" + str(i) for i in range(len(factors_data))] + else: + assert factors_data.shape[1] == len(factors_names) + + # warn if inputs are identical to targets + if factors_data.shape == targets_data.shape: + if np.shares_memory(factors_data, targets_data): + if factors_to_use is None and factors_names_to_use is None: + if verbose > 1: + logger.info( + "factors_data and targets_data share the same memory. factors_to_use will be determined automatically to not contain any target columns." + ) + x = set(range(factors_data.shape[1])) - set(y) + else: + if factors_to_use is not None: + x = set(factors_to_use) - set(y) + if verbose > 1: + logger.info(f"Using only {len(factors_to_use):_} predefined factors: {factors_to_use}") + else: + x = [i for i, col_name in enumerate(factors_names) if col_name in factors_names_to_use and i != y] + if verbose > 1: + logger.info(f"Using only {len(factors_names_to_use):_} predefined factors: {factors_names_to_use}") + else: + + assert not set(y).issubset(set(x)) + + # --------------------------------------------------------------------------------------------------------------- + # Inits + # --------------------------------------------------------------------------------------------------------------- + + start_time = timer() + run_out_of_time = False + + if random_seed is not None: + np.random.seed(random_seed) + cp.random.seed(random_seed) + set_numba_random_seed(random_seed) + + max_failed = int(full_npermutations * (1 - min_nonzero_confidence)) + if max_failed <= 1: + max_failed = 1 + + selected_interactions_vars = [] + selected_vars = [] # stores just indices. can't use set 'cause the order is important for efficient computing + predictors = [] # stores more details. 
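+ + # Caches shared across candidate evaluations (filled lazily): + # cached_MIs: quick direct MI estimates keyed by candidate tuple (baseline_npermutations shuffles), + # cached_confident_MIs: (MI, confidence) pairs confirmed with full_npermutations shuffles, + # cached_cond_MIs: conditional MIs I(X;Y|Z) keyed by the string arr2str(X) + "_" + arr2str(Z), + # entropy_cache: joint entropies reused by conditional_mi(); the latter two are numba typed dicts so they can be passed into njitted code.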
+ + cached_MIs = dict() + # cached_cond_MIs = dict() + cached_confident_MIs = dict() + entropy_cache = numba.typed.Dict.empty( + key_type=types.unicode_type, + value_type=types.float64, + ) + cached_cond_MIs = numba.typed.Dict.empty( + key_type=types.unicode_type, + value_type=types.float64, + ) + + data_copy = factors_data.copy() + + classes_y, freqs_y, _ = merge_vars(factors_data=factors_data, vars_indices=y, var_is_nominal=None, factors_nbins=factors_nbins, dtype=dtype) + classes_y_safe = classes_y.copy() + + if use_gpu: + classes_y_safe = cp.asarray(classes_y.astype(np.int32)) + freqs_y_safe = cp.asarray(freqs_y) + else: + freqs_y_safe = None + + if n_workers and n_workers > 1: + # global classes_y_memmap + # classes_y_memmap = mem_map_array(obj=classes_y, file_name="classes_y", mmap_mode="r") + if verbose: + logger.info("Starting parallel pool...") + workers_pool = Parallel(n_jobs=n_workers, **parallel_kwargs) + workers_pool(delayed(test)(i) for i in range(n_workers)) + else: + workers_pool = None + + subsets = range(interactions_min_order, interactions_max_order + 1) + if interactions_order_reversed: + subsets = subsets[::-1] + + if verbose: + logger.info( + f"Starting work with full_npermutations={full_npermutations:_}, min_nonzero_confidence={min_nonzero_confidence:.{ndigits}f}, max_failed={max_failed:_}" + ) + + for interactions_order in (subsets_pbar := tqdmu(subsets, desc="Interactions order", leave=False)): + + if run_out_of_time: + break + subsets_pbar.set_description(f"{interactions_order}-way interactions") + + # --------------------------------------------------------------------------------------------------------------- + # Generate candidates + # --------------------------------------------------------------------------------------------------------------- + + candidates = [tuple(el) for el in combinations(x, interactions_order)] + + # --------------------------------------------------------------------------------------------------------------- + # Subset level inits + # --------------------------------------------------------------------------------------------------------------- + + total_disproved = 0 + total_checked = 0 + partial_gains = {} + added_candidates = set() + failed_candidates = set() + nconsec_unconfirmed = 0 + + for n_confirmed_predictors in (predictors_pbar := tqdmu(range(len(candidates)), leave=False, desc="Confirmed predictors")): + # if n_confirmed_predictors>4: n_jobs=1 + if run_out_of_time: + break + + # --------------------------------------------------------------------------------------------------------------- + # Find candidate X with the highest current_gain given already selected factors + # --------------------------------------------------------------------------------------------------------------- + + best_candidate = None + best_gain = min_relevance_gain - 1 + expected_gains = np.zeros(len(candidates), dtype=np.float64) + + while True: # confirmation loop (by random permutations) + + if verbose and len(selected_vars) < MAX_ITERATIONS_TO_TRACK: + eval_start = timer() + + feasible_candidates = [] + for cand_idx, X in enumerate(candidates): + skip, nexisting = should_skip_candidate( + cand_idx=cand_idx, + X=X, + interactions_order=interactions_order, + only_unknown_interactions=only_unknown_interactions, + failed_candidates=failed_candidates, + added_candidates=added_candidates, + expected_gains=expected_gains, + selected_vars=selected_vars, + selected_interactions_vars=selected_interactions_vars, + ) + if skip: + continue + + 
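+ # candidates surviving the skip checks are queued here and evaluated either in-process or, when numerous enough, split into chunks for the joblib workers pool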
feasible_candidates.append((cand_idx, X, nexisting if reduce_gain_on_subelement_chosen else 0)) + + if n_workers and n_workers > 1 and len(feasible_candidates) > NMAX_NONPARALLEL_ITERS: + + res = workers_pool( + delayed(evaluate_candidates)( + workload=workload, + y=y, + best_gain=best_gain, + factors_data=factors_data, + factors_nbins=factors_nbins, + factors_names=factors_names, + classes_y=classes_y, + freqs_y=freqs_y, + use_gpu=use_gpu, + freqs_y_safe=freqs_y_safe, + partial_gains=partial_gains, + baseline_npermutations=baseline_npermutations, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + selected_vars=selected_vars, + cached_MIs=cached_MIs, + cached_confident_MIs=cached_confident_MIs, + cached_cond_MIs=dict(cached_cond_MIs), + entropy_cache=dict(entropy_cache), + max_runtime_mins=max_runtime_mins, + start_time=start_time, + min_relevance_gain=min_relevance_gain, + verbose=verbose, + ndigits=ndigits, + ) + for workload in split_list_into_chunks(feasible_candidates, max(1, len(feasible_candidates) // n_workers)) + ) + + for ( + worker_best_gain, + worker_best_candidate, + worker_partial_gains, + worker_expected_gains, + worker_cached_MIs, + worker_cached_cond_MIs, + worker_entropy_cache, + ) in res: + + if worker_best_gain > best_gain: + best_candidate = worker_best_candidate + best_gain = worker_best_gain + + # sync caches + for local_storage, global_storage in [ + (worker_expected_gains, expected_gains), + (worker_cached_MIs, cached_MIs), + (worker_cached_cond_MIs, cached_cond_MIs), + (worker_entropy_cache, entropy_cache), + ]: + for key, value in local_storage.items(): + global_storage[key] = value + + for cand_idx, (worker_current_gain, worker_z_idx) in worker_partial_gains.items(): + if cand_idx in partial_gains: + current_gain, z_idx = partial_gains[cand_idx] + else: + z_idx = -2 + if worker_z_idx > z_idx: + partial_gains[cand_idx] = (worker_current_gain, worker_z_idx) + + if max_runtime_mins and not run_out_of_time: + run_out_of_time = (timer() - start_time) > max_runtime_mins * 60 + if run_out_of_time: + logging.info(f"Time limit exhausted. 
Finalizing the search...") + break + + else: + for cand_idx, X, nexisting in (candidates_pbar := tqdmu(feasible_candidates, leave=False, desc="Candidates")): + + # tmp_idx=X[0] + # print(X,factors_nbins[tmp_idx],factors_names[tmp_idx]) + # from time import sleep + # sleep(5) + + current_gain, sink_reasons = evaluate_candidate( + cand_idx=cand_idx, + X=X, + y=y, + nexisting=nexisting, + best_gain=best_gain, + factors_data=factors_data, + factors_nbins=factors_nbins, + factors_names=factors_names, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + use_gpu=use_gpu, + freqs_y_safe=freqs_y_safe, + partial_gains=partial_gains, + baseline_npermutations=baseline_npermutations, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + expected_gains=expected_gains, + selected_vars=selected_vars, + cached_MIs=cached_MIs, + cached_confident_MIs=cached_confident_MIs, + cached_cond_MIs=cached_cond_MIs, + entropy_cache=entropy_cache, + verbose=verbose, + ndigits=ndigits, + ) + + best_gain, best_candidate, run_out_of_time = handle_best_candidate( + current_gain=current_gain, + best_gain=best_gain, + X=X, + best_candidate=best_candidate, + factors_names=factors_names, + max_runtime_mins=max_runtime_mins, + start_time=start_time, + min_relevance_gain=min_relevance_gain, + verbose=verbose, + ndigits=ndigits, + ) + + if run_out_of_time: + if verbose: + logging.info(f"Time limit exhausted. Finalizing the search...") + break + + if verbose and len(selected_vars) < MAX_ITERATIONS_TO_TRACK: + logger.info(f"evaluate_candidates took {timer() - eval_start:.1f} sec.") + + if best_gain < min_relevance_gain: + if verbose: + logger.info("Minimum expected gain reached or no candidates to check anymore.") + break # exit confirmation while loop + + # --------------------------------------------------------------------------------------------------------------- + # Now need to confirm best expected gain with a permutation test + # --------------------------------------------------------------------------------------------------------------- + + cand_confirmed = False + any_cand_considered = False + for n, next_best_candidate_idx in enumerate(np.argsort(expected_gains)[::-1]): + next_best_gain = expected_gains[next_best_candidate_idx] + # logger.info(f"{n}, {next_best_gain}, {min_relevance_gain}") + if next_best_gain >= min_relevance_gain: # only can consider here candidates fully checked against every Z + + X = candidates[next_best_candidate_idx] + + # --------------------------------------------------------------------------------------------------------------- + # For cands other than the top one, if best partial gain <= next_best_gain, we can proceed with confirming next_best_gain. + # else we have to recompute partial gains + # --------------------------------------------------------------------------------------------------------------- + + if n > 0: + best_partial_gain, best_key = find_best_partial_gain( + partial_gains=partial_gains, + failed_candidates=failed_candidates, + added_candidates=added_candidates, + candidates=candidates, + selected_vars=selected_vars, + ) + + if best_partial_gain > next_best_gain: + best_gain = next_best_gain + if verbose > 1: + print( + "Have no best_candidate anymore. Need to recompute partial gains. 
best_partial_gain of candidate", + get_candidate_name(candidates[best_key], factors_names=factors_names), + "was", + best_partial_gain, + ) + break # out of best candidates confirmation, to retry all cands evaluation + + any_cand_considered = True + + if full_npermutations: + + if verbose > 1: + logger.info( + f"confirming candidate {get_candidate_name(X, factors_names=factors_names)}, next_best_gain={next_best_gain:.{ndigits}f}" + ) + + # --------------------------------------------------------------------------------------------------------------- + # Compute confidence by bootstrap + # --------------------------------------------------------------------------------------------------------------- + + total_checked += 1 + if X in cached_confident_MIs: + bootstrapped_gain, confidence = cached_confident_MIs[X] + else: + if use_gpu: + bootstrapped_gain, confidence = mi_direct_gpu( + factors_data, + x=X, + y=y, + factors_nbins=factors_nbins, + classes_y=classes_y, + freqs_y=freqs_y, + freqs_y_safe=freqs_y_safe, + classes_y_safe=classes_y_safe, + min_nonzero_confidence=min_nonzero_confidence, + npermutations=full_npermutations, + ) + else: + if verbose and len(selected_vars) < MAX_ITERATIONS_TO_TRACK: + eval_start = timer() + bootstrapped_gain, confidence = mi_direct( + factors_data, + x=X, + y=y, + factors_nbins=factors_nbins, + classes_y=classes_y, + freqs_y=freqs_y, + classes_y_safe=classes_y_safe, + min_nonzero_confidence=min_nonzero_confidence, + npermutations=full_npermutations, + n_workers=n_workers, + workers_pool=workers_pool, + parallel_kwargs=parallel_kwargs, + ) + if verbose and len(selected_vars) < MAX_ITERATIONS_TO_TRACK: + logger.info(f"mi_direct bootstrapped eval took {timer() - eval_start:.1f} sec.") + cached_confident_MIs[X] = bootstrapped_gain, confidence + else: + if X in cached_confident_MIs: + bootstrapped_gain, confidence = cached_confident_MIs[X] + else: + bootstrapped_gain, confidence = next_best_gain, 1.0 + + if full_npermutations and bootstrapped_gain > 0 and selected_vars: # additional check of Fleuret criteria + + if count_cand_nbins(X, factors_nbins) <= MAX_CONFIRMATION_CAND_NBINS: + + skip_cand = [(subel in selected_vars) for subel in X] + nexisting = sum(skip_cand) + + # --------------------------------------------------------------------------------------------------------------- + # external bootstrapped recheck. is minimal MI of candidate X with Y given all current Zs THAT BIG as next_best_gain? 
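+ # get_fleuret_criteria_confidence() shuffles the target columns (and the candidate's own columns too when extra_x_shuffling is set), re-evaluates the Fleuret gain, and counts how often the shuffled gain reaches the original one; + # once nfailed hits max_failed the gain is zeroed out, and confidence is reported as 1 - nfailed / nchecked.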
+ # --------------------------------------------------------------------------------------------------------------- + + if verbose and len(selected_vars) < MAX_ITERATIONS_TO_TRACK: + eval_start = timer() + + if n_workers and n_workers > 1 and full_npermutations > NMAX_NONPARALLEL_ITERS: + bootstrapped_gain, confidence, parallel_entropy_cache = get_fleuret_criteria_confidence_parallel( + data_copy=data_copy, + factors_nbins=factors_nbins, + x=X, + y=y, + selected_vars=selected_vars, + bootstrapped_gain=next_best_gain, + npermutations=full_npermutations, + max_failed=max_failed, + nexisting=nexisting, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + cached_cond_MIs=cached_cond_MIs, + entropy_cache=entropy_cache, + extra_x_shuffling=extra_x_shuffling, + n_workers=n_workers, + workers_pool=workers_pool, + parallel_kwargs=parallel_kwargs, + ) + for key, value in parallel_entropy_cache.items(): + entropy_cache[key] = value + else: + nfailed, nchecked = get_fleuret_criteria_confidence( + data_copy=data_copy, + factors_nbins=factors_nbins, + x=X, + y=y, + selected_vars=selected_vars, + bootstrapped_gain=next_best_gain, + npermutations=full_npermutations, + max_failed=max_failed, + nexisting=nexisting, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + cached_cond_MIs=cached_cond_MIs, + entropy_cache=entropy_cache, + extra_x_shuffling=extra_x_shuffling, + ) + # logger.info(f"nfailed={nfailed}, nchecked={nchecked}") + confidence = 1 - nfailed / nchecked + if nfailed >= max_failed: + bootstrapped_gain = 0.0 + + if verbose and len(selected_vars) < MAX_ITERATIONS_TO_TRACK: + logger.info(f"get_fleuret_criteria_confidence bootstrapped eval took {timer() - eval_start:.1f} sec.") + + # --------------------------------------------------------------------------------------------------------------- + # Report this particular best candidate + # --------------------------------------------------------------------------------------------------------------- + + if bootstrapped_gain > 0: + + nconsec_unconfirmed = 0 + + # --------------------------------------------------------------------------------------------------------------- + # Bad confidence can make us consider other candidates! 
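+ # next_best_gain is scaled by the measured confidence below; if some other candidate's best partial gain then exceeds it, we break out and re-evaluate all candidates.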
+ # --------------------------------------------------------------------------------------------------------------- + + next_best_gain = next_best_gain * confidence + expected_gains[next_best_candidate_idx] = next_best_gain + + best_partial_gain, best_key = find_best_partial_gain( + partial_gains=partial_gains, + failed_candidates=failed_candidates, + added_candidates=added_candidates, + candidates=candidates, + selected_vars=selected_vars, + skip_indices=(next_best_candidate_idx,), + ) + if best_partial_gain > next_best_gain: + if verbose > 1: + logger.info( + f"\t\tCandidate's lowered confidence {confidence} requires re-checking other candidates, as now its expected gain is only {next_best_gain:.{ndigits}f}, vs {best_partial_gain:.{ndigits}f}, of {get_candidate_name(candidates[best_key], factors_names=factors_names)}" + ) + break # out of best candidates confirmation, to retry all cands evaluation + else: + cand_confirmed = True + if full_npermutations: + if verbose > 1: + logger.info(f"\t\tconfirmed with confidence {confidence:.{ndigits}f}") + break # out of best candidates confirmation, to add candidate to the list, and go to more candidates + else: + expected_gains[next_best_candidate_idx] = 0.0 + failed_candidates.add(next_best_candidate_idx) + if verbose > 1: + logger.info(f"\t\tconfirmation failed with confidence {confidence:.{ndigits}f}") + + nconsec_unconfirmed += 1 + total_disproved += 1 + if max_consec_unconfirmed and (nconsec_unconfirmed > max_consec_unconfirmed): + if verbose: + logger.info(f"Maximum consecutive confirmation failures reached.") + break # out of best candidates confirmation, to finish the level + + else: # next_best_gain = 0 + break # nothing wrong, just retry all cands evaluation + + # --------------------------------------------------------------------------------------------------------------- + # Let's act upon results of the permutation test + # --------------------------------------------------------------------------------------------------------------- + + if cand_confirmed: + added_candidates.add(next_best_candidate_idx) # so it won't be selected again + best_candidate = X + best_gain = next_best_gain + break # exit confirmation while loop + else: + if not any_cand_considered: + best_gain = min_relevance_gain - 1 + if verbose: + logger.info("No more candidates to confirm.") + break # exit confirmation while loop + else: + best_gain = min_relevance_gain - 1 + if max_consec_unconfirmed and (nconsec_unconfirmed > max_consec_unconfirmed): + break # exit confirmation while loop + else: + pass # retry all cands evaluation + + # --------------------------------------------------------------------------------------------------------------- + # Add best candidate to the list, if criteria are met, or proceed to the next interactions_order + # --------------------------------------------------------------------------------------------------------------- + + if best_gain >= (min_relevance_gain if interactions_order == 1 else min_relevance_gain ** (1 / (interactions_order + 1))): + for var in best_candidate: + if var not in selected_vars: + selected_vars.append(var) + if interactions_order > 1: + selected_interactions_vars.append(var) + cand_name = get_candidate_name(best_candidate, factors_names=factors_names) + + res = {"name": cand_name, "indices": best_candidate, "gain": best_gain} + if full_npermutations: + res["confidence"] = confidence + predictors.append(res) + + if verbose: + mes = f"Added new predictor {cand_name} to the list with expected 
gain={best_gain:.{ndigits}f}" + if full_npermutations: + mes += f" and confidence={confidence:.3f}" + logger.info(mes) + + else: + if verbose: + if total_checked > 0: + details = f" Total candidates disproved: {total_disproved:_}/{total_checked:_} ({total_disproved*100/total_checked:.2f}%)" + else: + details = "" + logger.info(f"Can't add anything valuable anymore for interactions_order={interactions_order}.{details}") + predictors_pbar.total = len(candidates) + predictors_pbar.close() + break + + # postprocess_candidates(selected_vars) + # print(caching_hits_xyz, caching_hits_z, caching_hits_xz, caching_hits_yz) + if verbose: + logger.info("Finished.") + + any_influencing = set() + for vars_combination, (bootstrapped_gain, confidence) in cached_confident_MIs.items(): + if bootstrapped_gain > 0: + any_influencing.update(set(vars_combination)) + + """Select groups/clusters of correlated factors. Instead of using just the single best one, consider averages of all the + discarded factors that have a high direct_MI with the target but an additional_knowledge close to 0 with each + "winning" factor, and check whether they can strengthen their winning factor via ensembling. Strengthening in the sense of + the mean and variability of MI with the target over bootstrapped subsamples? + + key = arr2str(X) + "_" + arr2str(Z) + if key in cached_cond_MIs: + additional_knowledge = cached_cond_MIs[key] + """ + return selected_vars, predictors, any_influencing, entropy_cache, cached_MIs, cached_confident_MIs, cached_cond_MIs, classes_y, classes_y_safe, freqs_y + + +@njit() +def count_cand_nbins(X, factors_nbins) -> int: + sum_cand_nbins = 0 + for factor in X: + sum_cand_nbins += factors_nbins[factor] + return sum_cand_nbins + + +def find_best_partial_gain( + partial_gains: dict, failed_candidates: set, added_candidates: set, candidates: list, selected_vars: list, skip_indices: tuple = () +) -> float: + best_partial_gain = -LARGE_CONST + best_key = None + for key, value in partial_gains.items(): + if (key not in failed_candidates) and (key not in added_candidates) and (key not in skip_indices): + skip_cand = False + for subel in candidates[key]: + if subel in selected_vars: + skip_cand = True # the subelement or var itself is already selected + break + if skip_cand: + continue + partial_gain, _ = value + if partial_gain > best_partial_gain: + best_partial_gain = partial_gain + best_key = key + return best_partial_gain, best_key + + +def postprocess_candidates( + selected_vars: list, + factors_data: np.ndarray, + y: np.ndarray, + factors_nbins: np.ndarray, + classes_y: np.ndarray = None, + freqs_y: np.ndarray = None, + classes_y_safe: np.ndarray = None, + min_nonzero_confidence: float = 0.99999, + npermutations: int = 10_000, + interactions_max_order: int = 1, + ensure_target_influence: bool = True, + dtype=np.int32, + verbose: bool = True, + ndigits: int = 4, +): + """Post-analysis of prescreened candidates. + + 1) repeat the standard Fleuret screening process; some vars may be removed once all other candidates are taken into account. + 2) + 3) in the final set, compute for every factor + a) MI with every remaining predictor (and 2,3 way subsets) + + """ + # --------------------------------------------------------------------------------------------------------------- + # Repeat the standard Fleuret screening process; some vars may be removed once all other candidates are taken into account.
+ # --------------------------------------------------------------------------------------------------------------- + for cand_idx, X, nexisting in (candidates_pbar := tqdmu(selected_vars, leave=False, desc="Finalizing Candidates")): + current_gain, sink_reasons = evaluate_candidate( + cand_idx=cand_idx, + X=X, + y=y, + nexisting=nexisting, + best_gain=best_gain, + factors_data=factors_data, + factors_nbins=factors_nbins, + factors_names=factors_names, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + use_gpu=use_gpu, + freqs_y_safe=freqs_y_safe, + partial_gains=partial_gains, + baseline_npermutations=baseline_npermutations, + mrmr_relevance_algo=mrmr_relevance_algo, + mrmr_redundancy_algo=mrmr_redundancy_algo, + max_veteranes_interactions_order=max_veteranes_interactions_order, + expected_gains=expected_gains, + selected_vars=selected_vars, + cached_MIs=cached_MIs, + cached_confident_MIs=cached_confident_MIs, + cached_cond_MIs=cached_cond_MIs, + entropy_cache=entropy_cache, + verbose=verbose, + ndigits=ndigits, + ) + # --------------------------------------------------------------------------------------------------------------- + # Make sure with confidence that every candidate is related to the target + # --------------------------------------------------------------------------------------------------------------- + + if ensure_target_influence: + removed = [] + for X in tqdmu(selected_vars, desc="Ensuring target influence", leave=False): + bootstrapped_mi, confidence = mi_direct( + factors_data, + x=[X], + y=y, + factors_nbins=factors_nbins, + classes_y=classes_y, + freqs_y=freqs_y, + classes_y_safe=classes_y_safe, + min_nonzero_confidence=min_nonzero_confidence, + npermutations=npermutations, + ) + if bootstrapped_mi == 0: + if verbose: + print("Factor", X, "not related to target with confidence", confidence) + removed.append(X) + selected_vars = [el for el in selected_vars if el not in removed] + + # --------------------------------------------------------------------------------------------------------------- + # Repeat Fleuret process as many times as there is candidates left. + # This time account for possible interactions (fix bug in the professor's formula). + # --------------------------------------------------------------------------------------------------------------- + + """ + Compute redundancy stats for every X + + кто связан с каким количеством других факторов, какое количество информации с ними разделяет, в % к своей собственной энтропии. + Можно даже спуститься вниз по уровню и посчитать взвешенные суммы тех же метрик для его партнёров. + Тем самым можно косвенно определить, какие фичи скорее всего просто сливные бачки, и попробовать их выбросить. В итоге мы получим: + + ценные фичи, которые ни с кем другим не связаны, кроме мусорных и таргета. они содержат уникальное знание; + потенциально мусорные X, которые связаны с множеством других, и шарят очень много общей инфы с другими факторами Z, при том, + что эти другие факторы имеют много уникального знания о таргете помимо X: sum(I(Y;Z|X))>e; + все остальные "середнячки". 
+ """ + + entropies = {} + mutualinfos = {} + + for X in tqdmu(selected_vars, desc="Marginal stats", leave=False): + _, freqs, _ = merge_vars(factors_data=factors_data, vars_indices=[X], factors_nbins=factors_nbins, var_is_nominal=None, dtype=dtype) + factor_entropy = entropy(freqs=freqs) + entropies[X] = factor_entropy + + for a, b in tqdmu(combinations(selected_vars, 2), desc="1-way interactions", leave=False): + bootstrapped_mi, confidence = mi_direct( + factors_data, + x=[a], + y=[b], + factors_nbins=factors_nbins, + classes_y=classes_y, + freqs_y=freqs_y, + classes_y_safe=classes_y_safe, + min_nonzero_confidence=min_nonzero_confidence, + npermutations=npermutations, + ) + if bootstrapped_mi > 0: + mutualinfos[(a, b)] = bootstrapped_mi + + for y in tqdmu(selected_vars, desc="2-way interactions", leave=False): + for pair in combinations(set(selected_vars) - set([y]), 2): + bootstrapped_mi, confidence = mi_direct( + factors_data, + x=pair, + y=[y], + factors_nbins=factors_nbins, + classes_y=classes_y, + freqs_y=freqs_y, + classes_y_safe=classes_y_safe, + min_nonzero_confidence=min_nonzero_confidence, + npermutations=npermutations, + ) + if bootstrapped_mi > 0: + mutualinfos[(y, pair)] = bootstrapped_mi + + return entropies, mutualinfos + + +# ---------------------------------------------------------------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------------------------------------------------------------- + + +def create_redundant_continuous_factor( + df: pd.DataFrame, + factors: Sequence[str], + agg_func: object = np.sum, + noise_percent: float = 5.0, + dist: object = None, + dist_args: tuple = (), + name: str = None, + sep: str = "_", +) -> None: + """In a pandas dataframe, out of a few continuous factors, craft a new factor with known relationship and amount of redundancy.""" + if dist: + rvs = getattr(dist, "rvs") + assert rvs + noise = rvs(*dist_args, size=len(df)) + else: + noise = np.random.random(len(df)) + + # now the entire range of generated noise is scaled to the noise_percent of factors interaction's range + val_min, val_max = noise.min(), noise.max() + if np.isclose(val_max, val_min): + noise = np.zeros(len(noise), dtype=np.float32) + else: + noise = (noise - val_min) / (val_max - val_min) + + if not name: + name = sep.join(factors) + sep + f"{noise_percent:.0f}%{dist.name if dist else ''}noise" + + df[name] = agg_func(df[factors].values, axis=1) * (1 + (noise - 0.5) * noise_percent / 100) + + +def categorize_1d_array(vals: np.ndarray, min_ncats: int, method: str, astropy_sample_size: int, method_kwargs: dict, dtype=np.int16, nan_filler: float = 0.0): + + ordinal_encoder = OrdinalEncoder() + + # ---------------------------------------------------------------------------------------------------------------------------- + # Booleans bust become int8 + # ---------------------------------------------------------------------------------------------------------------------------- + + if vals.dtype.name != "category" and np.issubdtype(vals.dtype, np.bool_): + vals = vals.astype(np.int8) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Missings are imputed using rolling median (for ts safety) + # ---------------------------------------------------------------------------------------------------------------------------- + + if pd.isna(vals).any(): + # imputer = SimpleImputer(strategy="most_frequent", 
add_indicator=False) + # vals = imputer.fit_transform(vals.reshape(-1, 1)) + vals = pd.Series(vals) + # vals=vals.fillna(vals.rolling(window=nan_rolling_window,min_periods=nan_rolling_min_periods).apply(lambda x: mode(x)[0])).fillna(nan_filler).values + vals = vals.fillna(nan_filler).values + + vals = vals.reshape(-1, 1) + + if vals.dtype.name != "category": + nuniques = len(np.unique(vals[: min_ncats * 10])) + if nuniques <= min_ncats: + nuniques = len(np.unique(vals)) + else: + nuniques = min_ncats + + if method == "discretizer": + bins = method_kwargs.get("n_bins") + else: + bins = method_kwargs.get("bins") + + if vals.dtype.name != "category" and nuniques > min_ncats: + if method == "discretizer": + if nuniques > bins: + discretizer = KBinsDiscretizer(**method_kwargs, encode="ordinal") + new_vals = discretizer.fit_transform(vals) + else: + new_vals = ordinal_encoder.fit_transform(vals) + else: + if method == "numpy": + + bin_edges = np.histogram_bin_edges( + vals, + bins=bins, + ) + elif method == "astropy": + if bins == "blocks" and len(vals) >= astropy_sample_size: + _, bin_edges = histogram(np.random.choice(vals.ravel(), size=astropy_sample_size, replace=False), bins=bins) + elif bins == "knuth" and len(vals) >= astropy_sample_size: + _, bin_edges = histogram(np.random.choice(vals.ravel(), size=astropy_sample_size, replace=False), bins=bins) + else: + _, bin_edges = histogram(vals, bins=bins) + + if bin_edges[0] <= vals.min(): + bin_edges = bin_edges[1:] + + new_vals = ordinal_encoder.fit_transform(np.digitize(vals, bins=bin_edges, right=True)) + + else: + new_vals = ordinal_encoder.fit_transform(vals) + + return new_vals.ravel().astype(dtype) + + +def categorize_dataset_old( + df: pd.DataFrame, + method: str = "discretizer", + method_kwargs: dict = dict(strategy="quantile", n_bins=4), + min_ncats: int = 50, + astropy_sample_size: int = 10_000, + dtype=np.int16, + n_jobs: int = -1, + parallel_kwargs: dict = {}, +): + """ + Convert dataframe into ordinal-encoded one. 
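+ Numerical columns are binned one by one via categorize_1d_array (optionally in parallel through joblib when n_jobs allows it); category/object/bool columns are ordinal-encoded. + Returns (data, column_names, nbins), where nbins holds the per-column code range (max - min + 1).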
+ """ + + data = None + numerical_cols = [] + categorical_factors = [] + + numerical_cols = df.head(5).select_dtypes(exclude=("category", "object", "bool")).columns.values.tolist() + + data = [] + if n_jobs == -1 or n_jobs > 1: + fnc = delayed(categorize_1d_array) + else: + fnc = categorize_1d_array + + for col in tqdmu(numerical_cols, leave=False, desc="Binning of numericals"): + data.append( + fnc(vals=df[col].values, min_ncats=min_ncats, method=method, astropy_sample_size=astropy_sample_size, method_kwargs=method_kwargs, dtype=dtype) + ) + + if n_jobs == -1 or n_jobs > 1: + data = parallel_run(data, n_jobs=n_jobs, **parallel_kwargs) + data = np.vstack(data).T + + categorical_factors = df.select_dtypes(include=("category", "object", "bool")) + categorical_cols = [] + if categorical_factors.shape[1] > 0: + categorical_cols = categorical_factors.columns.values.tolist() + ordinal_encoder = OrdinalEncoder() + new_vals = ordinal_encoder.fit_transform(categorical_factors) + + max_cats = new_vals.max(axis=0) + exc_idx = max_cats > np.iinfo(dtype).max + n_max_cats = exc_idx.sum() + if n_max_cats: + logger.warning(f"{n_max_cats:_} factors exceeded dtype {dtype} and were truncated: {np.asarray(categorical_cols)[exc_idx]}") + new_vals = new_vals.astype(dtype) + + if data is None: + data = new_vals + else: + data = np.append(data, new_vals, axis=1) + + nbins = data.max(axis=0).astype(np.int32) + 1 - data.min(axis=0).astype(np.int32) + + return data, numerical_cols + categorical_cols, nbins + + +@njit +def digitize(arr: np.ndarray, bins: np.ndarray, dtype=np.int32) -> np.ndarray: + res = np.empty(len(arr), dtype=dtype) + for i, val in enumerate(arr): + for j, bin_edge in enumerate(bins): + if val <= bin_edge: + res[i] = j + break + return res + + +from numba import prange + + +# @njit() +def edges(arr, quantiles): + bin_edges = np.asarray(np.percentile(arr, quantiles)) + return bin_edges + + +@njit() +def quantize_dig(arr, bins): + return np.digitize(arr, bins[1:-1], right=True) + + +@njit() +def quantize_search(arr, bins): + return np.searchsorted(bins[1:-1], arr, side="right") + + +@njit() +def discretize_uniform(arr: np.ndarray, n_bins: int, min_value: float = None, max_value: float = None, dtype: object = np.int8) -> np.ndarray: + if min_value is None or max_value is None: + min_value, max_value = arrayMinMax(arr) + rev_bin_width = n_bins / (max_value - min_value + min_value / 2) + return ((arr - min_value) * rev_bin_width).astype(dtype) + + +@njit() +def discretize_array( + arr: np.ndarray, n_bins: int = 10, method: str = "quantile", min_value: float = None, max_value: float = None, dtype: object = np.int8 +) -> np.ndarray: + """Discretize cont variable into bins. + + Optimized version with mix of pure numpy and njitting. + + + %timeit quantize_search(df['a'].values,bins) #njitted + 24.6 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + time: 2 s (started: 2024-02-09 19:58:31 +03:00) + + %timeit quantize_search(df['a'].values,bins) #just numpy + 27.2 ms ± 219 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + time: 2.2 s (started: 2024-02-09 19:52:59 +03:00) + + %timeit quantize_dig(df['a'].values, bins) #njitted + 23.7 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + time: 1.92 s (started: 2024-02-09 19:58:24 +03:00) + + %timeit quantize_dig(df['a'].values, bins) #just numpy + 31.1 ms ± 292 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each) + time: 2.52 s (started: 2024-02-09 19:53:01 +03:00) + + + """ + if method == "uniform": + return discretize_uniform(arr=arr, n_bins=n_bins, min_value=min_value, max_value=max_value, dtype=dtype) + elif method == "quantile": + bins_edges = get_binning_edges(arr=arr, n_bins=n_bins, method=method, min_value=min_value, max_value=max_value) # pure numpy + # return quantize_dig(arr,bins_edges).astype(dtype) #njitted + return quantize_search(arr, bins_edges).astype(dtype) # njitted + + +@njit(parallel=True) +def discretize_2d_array( + arr: np.ndarray, + n_bins: int = 10, + method: str = "quantile", + min_ncats: int = 50, + min_values: float = None, + max_values: float = None, + dtype: object = np.int8, +) -> np.ndarray: + """ """ + + res = np.empty_like(arr, dtype=dtype) + + for col in prange(arr.shape[1]): + res[:, col] = discretize_array( + arr=arr[:, col], + n_bins=n_bins, + method=method, + min_value=min_values[col] if min_values is not None else None, + max_value=max_values[col] if max_values is not None else None, + dtype=dtype, + ) + return res + + +@jit(nopython=False) +def get_binning_edges(arr: np.ndarray, n_bins: int = 10, method: str = "uniform", min_value: float = None, max_value: float = None): + """ + np.quantiles works faster when unjitted + + %timeit edges(df['a'].values,quantiles) #njitted + 83.9 ms ± 274 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + time: 6.81 s (started: 2024-02-09 17:36:50 +03:00) + + %timeit edges(df['a'].values,quantiles) #just numpy + 30.9 ms ± 541 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + time: 2.52 s (started: 2024-02-09 17:35:58 +03:00) + """ + if method == "uniform": + if min_value is None or max_value is None: + min_value, max_value = arrayMinMax(arr) + bin_edges = np.linspace(min_value, max_value, n_bins + 1) + + elif method == "quantile": + quantiles = np.linspace(0, 100, n_bins + 1) + bin_edges = np.asarray(np.percentile(arr, quantiles)) + + return bin_edges + + +def discretize_sklearn( + arr: np.ndarray, n_bins: int = 10, method: str = "uniform", min_value: float = None, max_value: float = None, dtype: object = np.int8 +) -> np.ndarray: + """Simplified vesrion taken from Sklearn's KBinsdiscretizer. + np.searchsorted runs twice faster when unjitted (as of Feb 2024 at least), so the func is not njitted. + """ + + bins_edges = get_binning_edges(arr=arr, n_bins=n_bins, method=method, min_value=min_value, max_value=max_value) + return np.searchsorted(bins_edges[1:-1], arr, side="right").astype(dtype) + + +def categorize_dataset( + df: pd.DataFrame, + method: str = "quantile", + n_bins: int = 4, + min_ncats: int = 50, + dtype=np.int16, +): + """ + Convert dataframe into ordinal-encoded one. + For cat columns uses OrdinalEncoder. + For the rest uses new discretize_2d_array. + Does not care for min_cats yet. 
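+ + Rough usage sketch (the column names here are made up): + data, cols, nbins = categorize_dataset(df[["price", "volume", "side"]], method="quantile", n_bins=4) + # data: int-encoded 2D array; cols: numerical then categorical column names; nbins: per-column bin counts as a list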
+ """ + + data = None + numerical_cols = [] + categorical_factors = [] + + numerical_cols = df.head(5).select_dtypes(exclude=("category", "object", "bool")).columns.values.tolist() + + data = discretize_2d_array(arr=df[numerical_cols].values, n_bins=n_bins, method=method, min_ncats=min_ncats, min_values=None, max_values=None, dtype=dtype) + + categorical_factors = df.select_dtypes(include=("category", "object", "bool")) + categorical_cols = [] + if categorical_factors.shape[1] > 0: + categorical_cols = categorical_factors.columns.values.tolist() + ordinal_encoder = OrdinalEncoder() + new_vals = ordinal_encoder.fit_transform(categorical_factors) + + max_cats = new_vals.max(axis=0) + exc_idx = max_cats > np.iinfo(dtype).max + n_max_cats = exc_idx.sum() + if n_max_cats: + logger.warning(f"{n_max_cats:_} factors exceeded dtype {dtype} and were truncated: {np.asarray(categorical_cols)[exc_idx]}") + new_vals = new_vals.astype(dtype) + + if data is None: + data = new_vals + else: + data = np.append(data, new_vals, axis=1) + + nbins = data.max(axis=0).astype(np.int32) + 1 # -data.min(axis=0).astype(np.int32) + + return data, numerical_cols + categorical_cols, nbins.tolist() + + +class MRMR(BaseEstimator, TransformerMixin): + """Finds subset of features having highest impact on target and least redundancy. + + Parameters + ---------- + cv : int, cross-validation generator or an iterable, default=None + + Attributes + ---------- + + + n_features_ : int + The number of selected features with cross-validation. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + ranking_ ?: narray of shape (n_features,) + The feature ranking, such that `ranking_[i]` + corresponds to the ranking + position of the i-th feature. + Selected (i.e., estimated best) + features are assigned rank 1. + + support_ : ndarray of shape (n_features,) + The mask of selected features. + + """ + + def __init__( + self, + # quantization + quantization_method: str = "quantile", + quantization_nbins: int = 10, + quantization_dtype: object = np.int16, + # factors + factors_names_to_use: Sequence[str] = None, + factors_to_use: Sequence[int] = None, + # algorithm + mrmr_relevance_algo: str = "fleuret", + mrmr_redundancy_algo: str = "fleuret", + reduce_gain_on_subelement_chosen: bool = True, + # performance + extra_x_shuffling: bool = True, + dtype=np.int32, + random_seed: int = None, + use_gpu: bool = False, + n_workers: int = 1, + # confidence + min_occupancy: int = None, + min_nonzero_confidence: float = 0.99, + full_npermutations: int = 1_000, + baseline_npermutations: int = 100, + # stopping conditions + min_relevance_gain: float = 0.00001, + max_consec_unconfirmed: int = 10, + max_runtime_mins: int = None, + interactions_min_order: int = 1, + interactions_max_order: int = 1, + interactions_order_reversed: bool = False, + max_veteranes_interactions_order: int = 1, + only_unknown_interactions: bool = False, + # feature engineering settings + fe_max_steps=1, + fe_npermutations=0, + fe_unary_preset="minimal", + fe_binary_preset="minimal", + fe_max_pair_features=1, + fe_min_nonzero_confidence=1.0, + fe_min_pair_mi_prevalence=1.05, # transformations of what exactly pairs of factors we consider, at all. 
mi of entire pair must be at least that higher than the mi of its individual factors. + fe_min_engineered_mi_prevalence=0.98, # mi of transformed pair must be at least that higher than the mi of the entire pair + fe_good_to_best_feature_mi_threshold=0.98, # when multiple good transformations exist for the same factors pair. + fe_max_polynoms: int = 0, + fe_print_best_mis_only: bool = True, + fe_smart_polynom_iters: int = 0, + fe_smart_polynom_optimization_steps: int = 1000, + fe_min_polynom_degree: int = 3, + fe_max_polynom_degree: int = 8, + fe_min_polynom_coeff: float = -10.0, + fe_max_polynom_coeff: float = 10.0, + # verbosity and formatting + verbose: Union[bool, int] = 0, + ndigits: int = 5, + parallel_kwargs=dict(max_nbytes=MAX_JOBLIB_NBYTES), + # CV + cv: Union[object, int, None] = 3, + cv_shuffle: bool = False, + # service + random_state: int = None, + n_jobs: int = -1, + skip_retraining_on_same_shape: bool = False, + # hidden + n_features_in_: int = 0, + feature_names_in_: Sequence = None, + support_: np.ndarray = None, + ): + + # checks + + # assert isinstance(estimator, (BaseEstimator,)) + + # save params + + params = get_parent_func_args() + store_params_in_object(obj=self, params=params) + self.signature = None + + def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.DataFrame, pd.Series, np.ndarray], groups: Union[pd.Series, np.ndarray] = None, **fit_params): + """We run N selections on data subsets, and pick only features that appear in all selections""" + + # ---------------------------------------------------------------------------------------------------------------------------- + # Compute inputs/outputs signature + # ---------------------------------------------------------------------------------------------------------------------------- + + signature = (X.shape, y.shape) + if self.skip_retraining_on_same_shape: + if signature == self.signature: + if self.verbose: + logger.info(f"Skipping retraining on the same inputs signature {signature}") + return self + + # --------------------------------------------------------------------------------------------------------------- + # Inits + # --------------------------------------------------------------------------------------------------------------- + + start_time = timer() + ran_out_of_time = False + + quantization_method = self.quantization_method + quantization_nbins = self.quantization_nbins + dtype = self.dtype + + max_runtime_mins = self.max_runtime_mins + random_state = self.random_state + parallel_kwargs = self.parallel_kwargs + n_jobs = self.n_jobs + verbose = self.verbose + cv_shuffle = self.cv_shuffle + cv = self.cv + + fe_max_steps = self.fe_max_steps + fe_npermutations = self.fe_npermutations + fe_unary_preset = self.fe_unary_preset + fe_binary_preset = self.fe_binary_preset + fe_max_pair_features = self.fe_max_pair_features + fe_min_nonzero_confidence = self.fe_min_nonzero_confidence + fe_min_pair_mi_prevalence = self.fe_min_pair_mi_prevalence + fe_min_engineered_mi_prevalence = self.fe_min_engineered_mi_prevalence + fe_good_to_best_feature_mi_threshold = self.fe_good_to_best_feature_mi_threshold + fe_max_polynoms = self.fe_max_polynoms + fe_print_best_mis_only = self.fe_print_best_mis_only + fe_smart_polynom_iters = self.fe_smart_polynom_iters + fe_smart_polynom_optimization_steps = self.fe_smart_polynom_optimization_steps + fe_min_polynom_degree = self.fe_min_polynom_degree + fe_max_polynom_degree = self.fe_max_polynom_degree + fe_min_polynom_coeff = self.fe_min_polynom_coeff + 
fe_max_polynom_coeff = self.fe_max_polynom_coeff + + # ---------------------------------------------------------------------------------------------------------------------------- + # Init cv + # ---------------------------------------------------------------------------------------------------------------------------- + + """ + if cv is None or str(cv).isnumeric(): + if cv is None: + cv = 3 + + if groups is not None: + cv = GroupKFold(n_splits=cv, shuffle=cv_shuffle, random_state=random_state) + else: + cv = KFold(n_splits=cv, shuffle=cv_shuffle, random_state=random_state) + + if verbose: + logger.info(f"Using cv={cv}") + """ + + self.feature_names_in_ = X.columns.tolist() + self.n_features_in_ = len(self.feature_names_in_) + + # --------------------------------------------------------------------------------------------------------------- + # Temporarily inject targets + # --------------------------------------------------------------------------------------------------------------- + + target_prefix = "targ_" + str(np.random.random())[3:9] + y_shape = y.shape + if len(y_shape) == 2: + y_shape = y_shape[1] + else: + y_shape = 1 + target_names = [target_prefix + str(i) for i in range(y_shape)] + + if isinstance(y, np.ndarray): + vals = y + else: + vals = y.values + + X[target_names] = vals.reshape(-1, 1) + + # --------------------------------------------------------------------------------------------------------------- + # Discretize continuous data + # --------------------------------------------------------------------------------------------------------------- + + data, cols, nbins = categorize_dataset( + df=X.ffill().bfill(), + method=self.quantization_method, + n_bins=self.quantization_nbins, + dtype=self.quantization_dtype, + ) + target_indices = [cols.index(col) for col in target_names] + + # --------------------------------------------------------------------------------------------------------------- + # Core + # --------------------------------------------------------------------------------------------------------------- + + """ + if random_state is not None: + set_random_seed(random_state) + + splitter = cv.split(X=X, y=y, groups=groups) + + subsets_selections=[] + if n_jobs==-1 or n_jobs>1: + fnc=delayed(screen_predictors) + else: + fnc=screen_predictors + + if verbose: + splitter = tqdmu(splitter, desc="CV folds", leave=False, total=cv.n_splits) + + for nfold, (train_index, test_index) in enumerate(splitter): + subsets_selections.append(fnc( + factors_data=data, + y=[target_idx], + factors_nbins=n_bins, + factors_names=cols, + interactions_max_order=1, + full_npermutations=0, + baseline_npermutations=10, + min_nonzero_confidence=1.0, + max_consec_unconfirmed=20, + min_relevance_gain=0.025, + max_runtime_mins=None, + max_veteranes_interactions_order=1, + reduce_gain_on_subelement_chosen=True, + random_seed=None, + use_gpu=False, + n_workers=n_jobs, + verbose=2, + ndigits=5, + parallel_kwargs=parallel_kwargs, + + )) + + if n_jobs==-1 or n_jobs>1: + subsets_selections = parallel_run(jobs,n_jobs=n_jobs,**parallel_kwargs) + + for selected_vars, predictors, any_influencing in subsets_selections: + pass + + """ + + """ + #service + random_state: int = None, + n_jobs:int=-1, + """ + + if fe_max_steps > 0: + unary_transformations = create_unary_transformations(preset=fe_unary_preset) + binary_transformations = create_binary_transformations(preset=fe_binary_preset) + if fe_max_polynoms: + polynomial_transformations = {} # 'identity':lambda x: x, + for _ in range(fe_max_polynoms): + 
length = np.random.randint(3, 9) + # coef=(np.random.random(length)-0.5)*1 + coef = np.empty(shape=length, dtype=np.float32) + for i in range(length): + coef[i] = np.random.normal((1.0 if i == 1 else 0.0), scale=0.05) + + unary_transformations["poly_" + str(coef)] = coef + + if verbose > 1: + print(f"nunary_transformations: {len(unary_transformations):_}") + print(f"nbinary_transformations: {len(binary_transformations):_}") + + categorical_vars = X.head().select_dtypes(include=("category", "object", "bool")).columns.values.tolist() + categorical_vars = [cols.index(col) for col in categorical_vars] + + engineered_features = set() + checked_pairs = set() + + num_fs_steps = 0 + while True: + n_recommended_features = 0 + times_spent = DefaultDict(float) + selected_vars, predictors, any_influencing, entropy_cache, cached_MIs, cached_confident_MIs, cached_cond_MIs, classes_y, classes_y_safe, freqs_y = ( + screen_predictors( + factors_data=data, + y=target_indices, + factors_nbins=nbins, + factors_names=cols, + factors_names_to_use=self.factors_names_to_use, + factors_to_use=self.factors_to_use, + # algorithm + mrmr_relevance_algo=self.mrmr_relevance_algo, + mrmr_redundancy_algo=self.mrmr_redundancy_algo, + reduce_gain_on_subelement_chosen=self.reduce_gain_on_subelement_chosen, + # performance + extra_x_shuffling=self.extra_x_shuffling, + dtype=self.dtype, + random_seed=self.random_seed, + use_gpu=self.use_gpu, + n_workers=self.n_workers, + # confidence + min_occupancy=self.min_occupancy, + min_nonzero_confidence=self.min_nonzero_confidence, + full_npermutations=self.full_npermutations, + baseline_npermutations=self.baseline_npermutations, + # stopping conditions + min_relevance_gain=self.min_relevance_gain, + max_consec_unconfirmed=self.max_consec_unconfirmed, + max_runtime_mins=self.max_runtime_mins, + interactions_min_order=self.interactions_min_order, + interactions_max_order=self.interactions_max_order, + interactions_order_reversed=self.interactions_order_reversed, + max_veteranes_interactions_order=self.max_veteranes_interactions_order, + only_unknown_interactions=self.only_unknown_interactions, + # verbosity and formatting + verbose=self.verbose, + ndigits=self.ndigits, + parallel_kwargs=self.parallel_kwargs, + ) + ) + + if fe_max_steps == 0 or num_fs_steps >= fe_max_steps: + break + + # Feature engineering part here + + numeric_vars_to_consider = set(selected_vars) - set(categorical_vars) + for raw_vars_pair in combinations(numeric_vars_to_consider, 2): + # check that every element of a pair has computed its MI with target + for var in raw_vars_pair: + if (var,) not in cached_confident_MIs and (var,) not in cached_MIs: + mi, conf = mi_direct( + data, + x=(var,), + y=target_indices, + factors_nbins=nbins, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + min_nonzero_confidence=fe_min_nonzero_confidence, + npermutations=fe_npermutations, + ) + cached_MIs[(var,)] = mi + + # ensure that pair as a whole has computed its MI with target + if raw_vars_pair not in cached_confident_MIs and raw_vars_pair not in cached_MIs: + mi, conf = mi_direct( + data, + x=raw_vars_pair, + y=target_indices, + factors_nbins=nbins, + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + min_nonzero_confidence=fe_min_nonzero_confidence, + npermutations=fe_npermutations, + ) + cached_MIs[raw_vars_pair] = mi + + # --------------------------------------------------------------------------------------------------------------- + # For every pair of factors (A,B), select 
ones having MI((A,B),Y) > (MI(A,Y)+MI(B,Y)) * fe_min_pair_mi_prevalence. Such pairs must possess a more special (synergistic) connection! + # --------------------------------------------------------------------------------------------------------------- + + vars_usage_counter = DefaultDict(int) + prospective_pairs = [] + for raw_vars_pair, pair_mi in sort_dict_by_value(cached_MIs).items(): + if len(raw_vars_pair) == 2: + if raw_vars_pair in checked_pairs: + continue + if raw_vars_pair[0] in numeric_vars_to_consider and raw_vars_pair[1] in numeric_vars_to_consider: + ind_elems_mi_sum = cached_MIs[(raw_vars_pair[0],)] + cached_MIs[(raw_vars_pair[1],)] + if pair_mi > ind_elems_mi_sum * fe_min_pair_mi_prevalence: + if verbose: + logger.info( + f"Factors pair {raw_vars_pair} will be considered for Feature Engineering, {pair_mi:.4f}>{ind_elems_mi_sum:.4f}, rat={pair_mi/ind_elems_mi_sum:.2f}" + ) + prospective_pairs.append((raw_vars_pair, pair_mi)) + for var in raw_vars_pair: + vars_usage_counter[var] += 1 + + if fe_smart_polynom_iters: + + # --------------------------------------------------------------------------------------------------------------- + # We search for the best unary & binary transforms using Hermite polynomials & Optuna! + # Degrees are kept reasonably small as a form of regularization. + # In theory (especially if inputs are normalized), Hermite polynomials can approximate any functional form, thereby replacing our + # random experimenting with arbitrary functions (which was pretty limited anyway). + # --------------------------------------------------------------------------------------------------------------- + + import optuna + from optuna.samplers import TPESampler + + optuna.logging.set_verbosity(optuna.logging.WARNING) + + def get_best_polynom_mi(coef_a, coef_b, vals_a, vals_b) -> float: + + transformed_var_a = hermval(vals_a, c=coef_a) + transformed_var_b = hermval(vals_b, c=coef_b) + + best_mi = -1 + best_config = None + + for bin_func_name, bin_func in binary_transformations.items(): + + final_transformed_vals = bin_func(transformed_var_a, transformed_var_b) + + discretized_transformed_values = discretize_array( + arr=final_transformed_vals, n_bins=self.quantization_nbins, method=self.quantization_method, dtype=self.quantization_dtype + ) + fe_mi, fe_conf = mi_direct( + discretized_transformed_values.reshape(-1, 1), + x=[0], + y=None, + factors_nbins=[self.quantization_nbins], + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + min_nonzero_confidence=fe_min_nonzero_confidence, + npermutations=fe_npermutations, + ) + + if fe_mi > best_mi: + best_mi = fe_mi + best_config = bin_func_name + + return best_mi + + for raw_vars_pair, pair_mi in prospective_pairs: + vals_a = X.iloc[:, raw_vars_pair[0]].values + vals_b = X.iloc[:, raw_vars_pair[1]].values + + for _ in range(fe_smart_polynom_iters): + + length_a = np.random.randint(fe_min_polynom_degree, fe_max_polynom_degree) + length_b = np.random.randint(fe_min_polynom_degree, fe_max_polynom_degree) + + # Define an objective function to be maximized. 
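+ # Note: hermval(x, c) evaluates the Hermite series c[0]*H_0(x) + c[1]*H_1(x) + ... (assuming the hermval imported
+ # elsewhere in this module is numpy.polynomial.hermite.hermval), so the study below searches over Hermite
+ # coefficients for each factor of the pair, and scores each candidate by the best binary transform of the two
+ # expansions, i.e. the one with the highest MI against the target.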
+ def objective(trial): + + coef_a = np.empty(shape=length_a, dtype=np.float32) + for i in range(length_a): + coef_a[i] = trial.suggest_float(f"a_{i}", fe_min_polynom_coeff, fe_max_polynom_coeff) + + coef_b = np.empty(shape=length_b, dtype=np.float32) + for i in range(length_b): + coef_b[i] = trial.suggest_float(f"b_{i}", fe_min_polynom_coeff, fe_max_polynom_coeff) + + res = get_best_polynom_mi(coef_a=coef_a, coef_b=coef_b, vals_a=vals_a, vals_b=vals_b) + + return res + + study = optuna.create_study(direction="maximize", sampler=TPESampler(multivariate=True)) # Create a new study. + study.optimize(objective, n_trials=fe_smart_polynom_optimization_steps) # Invoke optimization of the objective function. + + print(f"Best MI: {study.best_trial.value:.4f}, pair_mi={pair_mi:.4f}") + print(f"Best hyperparameters: {study.best_params}") + else: + + # --------------------------------------------------------------------------------------------------------------- + # Starting from the most heavily connected pairs, create a big pool of original features+their unary transforms. + # --------------------------------------------------------------------------------------------------------------- + + # individual vars referenced more than once go to the global pool, rest to the local (not stored)? + + transformed_vars = np.empty(shape=(len(X), len(prospective_pairs) * len(unary_transformations) * 2), dtype=np.float32) + vars_transformations = {} + i = 0 + for raw_vars_pair, pair_mi in prospective_pairs: + for var in raw_vars_pair: + vals = X.iloc[:, var].values + for tr_name, tr_func in unary_transformations.items(): + key = (var, tr_name) + if key not in vars_transformations: + if "poly_" in tr_name: + transformed_vars[:, i] = hermval(vals, c=tr_func) + else: + transformed_vars[:, i] = tr_func(vals) + vars_transformations[key] = i + i += 1 + # --------------------------------------------------------------------------------------------------------------- + # Then, for every pair from the pool, try all known functions of 2 variables (not storing results in persistent RAM). + # Record best pairs. + # --------------------------------------------------------------------------------------------------------------- + + for raw_vars_pair, pair_mi in prospective_pairs: # !TODO! better to start considering form the most prospective pairs with highest mis ratio! + combs = list( + combinations( + [(raw_vars_pair[0], key) for key in unary_transformations.keys()] + + [(raw_vars_pair[1], key) for key in unary_transformations.keys()], + 2, + ) + ) + combs = [ + transformations_pair for transformations_pair in combs if transformations_pair[0][0] != transformations_pair[1][0] + ] # let's skip trying to transform the same factor for now + # print(f"trying {len(combs):_} combs") + + best_config, best_mi = None, -1 + var_pairs_perf = {} + + final_transformed_vals = np.empty( + shape=(len(X), len(combs) * len(binary_transformations)), dtype=np.float32 + ) # !TODO! optimize allocation of this array before the main loop! 
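+ # Note: this buffer holds one float32 column per (pair of unary transforms, binary function) candidate for the
+ # current raw_vars_pair, i.e. len(X) rows by len(combs) * len(binary_transformations) columns; it stays small with
+ # the "minimal" presets but grows quickly with the "maximal" ones.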
+ + i = 0 + for transformations_pair in tqdmu(combs, desc=f"transforming {raw_vars_pair}", leave=False): + + param_a = transformed_vars[:, vars_transformations[transformations_pair[0]]] + param_b = transformed_vars[:, vars_transformations[transformations_pair[1]]] + + for bin_func_name, bin_func in binary_transformations.items(): + + start = timer() + final_transformed_vals[:, i] = bin_func(param_a, param_b) + times_spent[bin_func_name] += timer() - start + + discretized_transformed_values = discretize_array( + arr=final_transformed_vals[:, i], n_bins=self.quantization_nbins, method=self.quantization_method, dtype=self.quantization_dtype + ) + fe_mi, fe_conf = mi_direct( + discretized_transformed_values.reshape(-1, 1), + x=[0], + y=None, + factors_nbins=[self.quantization_nbins], + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + min_nonzero_confidence=fe_min_nonzero_confidence, + npermutations=fe_npermutations, + ) + + config = (transformations_pair, bin_func_name, i) + var_pairs_perf[config] = fe_mi + + if fe_mi > best_mi: + best_mi = fe_mi + best_config = config + if fe_mi > best_mi * 0.85: + if not fe_print_best_mis_only or (fe_mi == best_mi): + if verbose > 1: + print(f"MI of transformed pair {bin_func_name}({transformations_pair})={fe_mi:.4f}, MI of the plain pair {pair_mi:.4f}") + i += 1 + if verbose > 1: + print(f"For pair {raw_vars_pair}, best config is {best_config} with best mi= {best_mi}") + + if best_mi / pair_mi > fe_min_engineered_mi_prevalence * (1.0 if num_fs_steps < 1 else 1.025): + + # --------------------------------------------------------------------------------------------------------------- + # Now, if there is a group of leaders with almost the same performance, we need to approve them through some of the other variables. + # (if such groups of roughly equally strong leaders arise, they have to be resolved with the help of one of the other influencing factors) + # --------------------------------------------------------------------------------------------------------------- + + leading_features = [] + for next_config, next_mi in sort_dict_by_value(var_pairs_perf).items(): + if next_mi > best_mi * fe_good_to_best_feature_mi_threshold: + leading_features.append(next_config) + + if len(leading_features) > 1: + if len(numeric_vars_to_consider) > 2: + + new_vals = np.empty(shape=(len(X), fe_max_pair_features), dtype=self.quantization_dtype) + new_nbins = [] + new_cols = [] + + if verbose > 1: + print(f"Taking {len(leading_features)} new features for a separate validation step!") + + # --------------------------------------------------------------------------------------------------------------- + # Now let's test all of the candidates as is against the rest of the approved factors (also as is). + # Candidates that stand out significantly (in terms of MI with the target) together with any of the other approved factors are kept. 
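+ # (For example, with fe_good_to_best_feature_mi_threshold=0.98, any candidate whose MI is within ~2% of the best
+ # candidate's MI enters this tie-breaking validation round.)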
+ # --------------------------------------------------------------------------------------------------------------- + + valid_pairs_perf = {} + + for transformations_pair, bin_func_name, i in leading_features: + param_a = final_transformed_vals[:, i] + + best_valid_mi = -1 + config = (transformations_pair, bin_func_name, i) + + for external_factor in tqdmu( + set(numeric_vars_to_consider) - set(raw_vars_pair), desc="external validation factor", leave=False + ): + param_b = X.iloc[:, external_factor].values + + for valid_bin_func_name, valid_bin_func in binary_transformations.items(): + + valid_vals = valid_bin_func(param_a, param_b) + + discretized_transformed_values = discretize_array( + arr=valid_vals, n_bins=self.quantization_nbins, method=self.quantization_method, dtype=self.quantization_dtype + ) + fe_mi, fe_conf = mi_direct( + discretized_transformed_values.reshape(-1, 1), + x=[0], + y=None, + factors_nbins=[self.quantization_nbins], + classes_y=classes_y, + classes_y_safe=classes_y_safe, + freqs_y=freqs_y, + min_nonzero_confidence=fe_min_nonzero_confidence, + npermutations=fe_npermutations, + ) + + if fe_mi > best_valid_mi: + best_valid_mi = fe_mi + if verbose > 1: + print( + f"MI of transformed pair {valid_bin_func_name}({(transformations_pair,bin_func_name)} with ext factor {external_factor})={fe_mi:.4f}" + ) + + valid_pairs_perf[config] = best_valid_mi + + # --------------------------------------------------------------------------------------------------------------- + # Now we recommend proceeding with top N best transformations! + # --------------------------------------------------------------------------------------------------------------- + + for j, (config, valid_mi) in enumerate(sort_dict_by_value(valid_pairs_perf, reverse=True).items()): + if j < fe_max_pair_features: + new_feature_name = get_new_feature_name(fe_tuple=config, cols_names=cols) + if verbose: + logger.info( + f"{new_feature_name} is recommended to use as a new feature! (won in validation with other factors) best_mi={best_mi:.4f}, pair_mi={pair_mi:.4f}, rat={best_mi/pair_mi:.4f}" + ) + transformations_pair, bin_func_name, i = config + + new_vals[:, j] = discretize_array( + arr=final_transformed_vals[:, i], + n_bins=self.quantization_nbins, + method=self.quantization_method, + dtype=self.quantization_dtype, + ) + new_nbins += [self.quantization_nbins] + new_cols += [new_feature_name] + X[new_feature_name] = final_transformed_vals[:, i] + engineered_features.add((transformations_pair, bin_func_name)) + + else: + break + new_vals = new_vals[:, : min(fe_max_pair_features, j)] + else: + if verbose: + logger.warning( + f"{len(leading_features)} are recommended to use as new features! (can't narrow down the list by validation with other factors) best_mi={best_mi:.4f}, pair_mi={pair_mi:.4f}, rat={best_mi/pair_mi:.4f}" + ) + + for j, (transformations_pair, bin_func_name, i) in enumerate(leading_features): + new_vals[:, j] = discretize_array( + arr=final_transformed_vals[:, i], + n_bins=self.quantization_nbins, + method=self.quantization_method, + dtype=self.quantization_dtype, + ) + new_nbins += [self.quantization_nbins] + new_cols += [new_feature_name] + X[new_feature_name] = final_transformed_vals[:, i] + engineered_features.add((transformations_pair, bin_func_name)) + else: + new_feature_name = get_new_feature_name(fe_tuple=best_config, cols_names=cols) + if verbose: + logger.info( + f"{new_feature_name} is recommended to use as a new feature! 
(clear winner) best_mi={best_mi:.4f}, pair_mi={pair_mi:.4f}, rat={best_mi/pair_mi:.4f}" + ) + + transformations_pair, bin_func_name, i = best_config + + new_vals = discretize_array( + arr=final_transformed_vals[:, i], n_bins=self.quantization_nbins, method=self.quantization_method, dtype=self.quantization_dtype + ).reshape(-1, 1) + new_nbins = [self.quantization_nbins] + new_cols = [new_feature_name] + X[new_feature_name] = final_transformed_vals[:, i] + engineered_features.add((transformations_pair, bin_func_name)) + + data = np.append(data, new_vals, axis=1) + nbins = nbins + new_nbins + cols = cols + new_cols + + n_recommended_features += len(new_cols) + + # !TODO! handle factors_to_use etc + """ + factors_names_to_use=self.factors_names_to_use, + factors_to_use=self.factors_to_use, + """ + checked_pairs.add(raw_vars_pair) + + if n_recommended_features == 0: + break + num_fs_steps += 1 + if verbose > 1: + print("time spent by binary func:", sort_dict_by_value(times_spent)) + # Possibly decide on eliminating original features? (if constructed ones cover 90%+ of MI) + + # --------------------------------------------------------------------------------------------------------------- + # Drop Temporarily targets + # --------------------------------------------------------------------------------------------------------------- + + X.drop(columns=target_names, inplace=True) + + # --------------------------------------------------------------------------------------------------------------- + # assign support + # --------------------------------------------------------------------------------------------------------------- + + self.support_ = selected_vars + if selected_vars: + self.n_features_ = len(selected_vars) + else: + self.n_features_ = 0 + + # --------------------------------------------------------------------------------------------------------------- + # assign extra vars for upcoming vars improving + # --------------------------------------------------------------------------------------------------------------- + + # self.cached_MIs_ = cached_MIs + # self.cached_cond_MIs_ = cached_cond_MIs + # self.cached_confident_MIs_ = cached_confident_MIs + + # --------------------------------------------------------------------------------------------------------------- + # Report FS results + # --------------------------------------------------------------------------------------------------------------- + + if verbose: + logger.info(f"MRMR+ selected {self.n_features_:_} out of {self.n_features_in_:_} features: {predictors}") + + self.signature = signature + return self + + def transform(self, X, y=None): + if isinstance(X, pd.DataFrame): + return X.iloc[:, self.support_] + else: + return X[:, self.support_] + + +def get_existing_feature_name(fe_tuple: tuple, cols_names: Sequence) -> str: + fname = cols_names[fe_tuple[0]] + if fe_tuple[1] == "identity": + return fname + else: + return f"{fe_tuple[1]}({fname})" + + +def get_new_feature_name(fe_tuple: tuple, cols_names: Sequence) -> str: + return f"{fe_tuple[1]}({get_existing_feature_name(fe_tuple=fe_tuple[0][0],cols_names=cols_names)},{get_existing_feature_name(fe_tuple=fe_tuple[0][1],cols_names=cols_names)})" # (((2, 'log'), (3, 'sin')), 'mul', 1016) + + +def njit_functions_dict(dict, exceptions: Sequence = ("grad1", "grad2", "sinc", "logn", "greater", "less", "equal")): + """Tries replacing funcs in the dict with their njitted equivqlents, caring for exceptions.""" + for key, func in dict.items(): + if key not in exceptions: + try: + 
dict[key] = njit(func) + except Exception as e: + pass + + +def create_unary_transformations(preset: str = "minimal"): + unary_constraints = { + # reverse trigonometric + "arccos": "-1to1", + "arcsin": "-1to1", + "arctan": "-pi/2topi/2", + # reverse hyperbolic + "arccosh": "1toinf", + "arctanh": "-0.(9)to0.(9)", + # powers + "sqrt": "pos", + "log": "pos", + "reciproc": "nonzero", + "invsquared": "nonzero", + "invqubed": "nonzero", + "invcbrt": "nonzero", + "invsqrt": "nonzero", + } + + unary_transformations = { + # simplest + "identity": lambda x: x, + } + if preset != "minimal": + unary_transformations.update( + { + "sign": np.sign, + "neg": np.negative, + "abs": np.abs, + # outliers removal + # Rounding + "rint": np.rint, + # np.modf Return the fractional and integral parts of an array, element-wise. + # clip + # powers + "squared": lambda x: np.power(x, 2), + "qubed": lambda x: np.power(x, 3), + "reciproc": lambda x: np.power(x, -1), + "invsquared": lambda x: np.power(x, -2), + "invqubed": lambda x: np.power(x, -3), + "cbrt": np.cbrt, + "sqrt": np.sqrt, + "invcbrt": lambda x: np.power(x, -1 / 3), + "invsqrt": lambda x: np.power(x, -1 / 2), + # logarithms + "log": np.log, + "exp": np.exp, + # trigonometric + "sin": np.sin, + } + ) + + if preset == "maximal": + unary_transformations.update( + { + # math an + "grad1": np.gradient, + "grad2": lambda x: np.gradient(x, edge_order=2), + # trigonometric + "sinc": np.sinc, + "cos": np.cos, + "tan": np.tan, + # reverse trigonometric + "arcsin": np.arcsin, + "arccos": np.arccos, + "arctan": np.arctan, + # hyperbolic + "sinh": np.sinh, + "cosh": np.cosh, + "tanh": np.tanh, + # reverse hyperbolic + "arcsinh": np.arcsinh, + "arccosh": np.arccosh, + "arctanh": np.arctanh, + # special + #'psi':sp.psi, polygamma(0,x) is same as psi + "erf": sp.erf, + "dawsn": sp.dawsn, + "gammaln": sp.gammaln, + #'spherical_jn':sp.spherical_jn + } + ) + + njit_functions_dict(unary_transformations) + + if preset == "maximal": + for order in range(3): + unary_transformations["polygamma_" + str(order)] = lambda x: sp.polygamma(order, x) + unary_transformations["struve" + str(order)] = lambda x: sp.struve(order, x) + unary_transformations["jv" + str(order)] = lambda x: sp.jv(order, x) + + """j0 + faster version of this function for order 0. + + j1 + faster version of this function for order 1. + """ + + return unary_transformations + + +def create_binary_transformations(preset: str = "minimal"): + + binary_transformations = { + # Basic + "mul": np.multiply, + "add": np.add, + # Extrema + "max": np.maximum, + "min": np.minimum, + } + + if preset == "maximal": + binary_transformations.update( + { + # All kinds of averages + "hypot": np.hypot, + "logaddexp": np.logaddexp, + "agm": sp.agm, # Compute the arithmetic-geometric mean of a and b. + # Rational routines + #'lcm':np.lcm, # requires int arguments # ufunc 'lcm' did not contain a loop with signature matching types (, ) -> None + #'gcd':np.gcd, # requires int arguments + #'mod':np.remainder, # requires int arguments + # Powers + "pow": np.power, # non-symmetrical! may required dtype=complex for arbitrary numbers + # Logarithms + "logn": lambda x, y: np.emath.logn(x - np.min(x) + 0.1, y - np.min(y) + 0.1), # non-symmetrical! + # DSP + # 'convolve':np.convolve, # symmetrical wrt args. scipy.signal.fftconvolve should be faster? SLOW? + # Linalg + "heaviside": np.heaviside, # non-symmetrical! 
+ #'cross':np.cross, # symmetrical # incompatible dimensions for cross product (dimension must be 2 or 3) + # Comparison + "greater": lambda x, y: np.greater(x, y).astype(int), + "less": lambda x, y: np.less(x, y).astype(int), + "equal": lambda x, y: np.equal(x, y).astype(int), + # special + "beta": sp.beta, # symmetrical + "binom": sp.binom, # non-symmetrical! binomial coefficient considered as a function of two real variables. + } + ) + + njit_functions_dict(binary_transformations) + + return binary_transformations diff --git a/mlframe/feature_selection/wrappers.py b/mlframe/feature_selection/wrappers.py new file mode 100644 index 0000000..f9a1b8b --- /dev/null +++ b/mlframe/feature_selection/wrappers.py @@ -0,0 +1,872 @@ +"""Feature selection within ML pipelines. Wrappers methods. Currently includes recursive feature elimination.""" + +# ---------------------------------------------------------------------------------------------------------------------------- +# LOGGING +# ---------------------------------------------------------------------------------------------------------------------------- + +import logging + +logger = logging.getLogger(__name__) + + +while True: + try: + + # ---------------------------------------------------------------------------------------------------------------------------- + # Normal Imports + # ---------------------------------------------------------------------------------------------------------------------------- + + from typing import * + + import pandas as pd, numpy as np + + from pyutilz.system import tqdmu + from pyutilz.numbalib import set_numba_random_seed + from pyutilz.pythonlib import store_params_in_object, get_parent_func_args + + from sklearn.pipeline import Pipeline + + from mlframe.config import * + from mlframe.utils import set_random_seed + from mlframe.baselines import get_best_dummy_score + from mlframe.helpers import has_early_stopping_support + from mlframe.preprocessing import pack_val_set_into_fit_params + from mlframe.metrics import sklearn_integral_calibration_error + from mlframe.optimization import * + + from sklearn.dummy import DummyClassifier, DummyRegressor + from sklearn.metrics import make_scorer, mean_squared_error + from sklearn.base import is_classifier, is_regressor, BaseEstimator, TransformerMixin + from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold, StratifiedShuffleSplit, GroupKFold, GroupShuffleSplit, KFold + + from enum import Enum, auto + from timeit import default_timer as timer + + import matplotlib.pyplot as plt + + import random + import copy + + except ModuleNotFoundError as e: + + logger.warning(e) + + if "cannot import name" in str(e): + raise (e) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Packages auto-install + # ---------------------------------------------------------------------------------------------------------------------------- + + from pyutilz.pythonlib import ensure_installed + + ensure_installed("numpy pandas scikit-learn") # cupy + + else: + break + +# ---------------------------------------------------------------------------------------------------------------------------- +# Inits +# ---------------------------------------------------------------------------------------------------------------------------- + + +class OptimumSearch(str, Enum): + ScipyLocal = "ScipyLocal" # Brent + ScipyGlobal = "ScipyGlobal" # direct, diff evol, shgo + ModelBasedHeuristic = 
"ModelBasedHeuristic" # GaussianProcess or Catboost with uncertainty, or quantile regression + ExhaustiveRandom = "ExhaustiveRandom" + ExhaustiveDichotomic = "ExhaustiveDichotomic" + + +class VotesAggregation(str, Enum): + Minimax = "Minimax" + OG = "OG" + Borda = "Borda" + Plurality = "Plurality" + Dowdall = "Dowdall" + Copeland = "Copeland" + AM = "AM" + GM = "GM" + + +class RFECV(BaseEstimator, TransformerMixin): + """Finds subset of features having best CV score, by iterative narrowing down set of top_n candidates having highest importance, as per estimator's FI scores. + + Optimizes mean CV scores (possibly accounting for variation, possibly translated into ranks) divided by the features number. + + Uses several optimization methods: + exhaustive search + random search + model-based heuristic search. + + Problems: + Impactful, but correlated factors all get low importance and will be thrown away (probably only for forests, not boostings?). + confirmed for boostings also! adding more predictors to original features worsens scores, whereas in theory it at least should not be worse! + + Due to noise some random features can become "important". + + Solution: + use CV to calculate fold FI, then combine across folds (by voting). + When estimating featureset quality at another TopN, use different splits & combine new FIs with all known before, to mitigate noise even more. + + Optionally plots (and saves) the optimization path - checked nfeatures and corresponding scores. + If surrogate models are used, also shows predicted scores along with confidence bounds. + + Challenges: + CV performance itself can be a multi-component value! Say, both ROC AUC and CALIB metrics can be considered. Voting can be a solution. + Estimator might itself be a HPT search instance. Or a pipeline. + It could be good to have several estimators. Their importance evaluations must be accounted for simultaneously (voting). + Estimator might need eval_set or similar (eval_frac). + Different folds invocations could benefit from generating all possible hyper parameters. Even if FS does not care, collected info could be used further at the HPT step. + + Parameters + ---------- + cv : int, cross-validation generator or an iterable, default=None + + Attributes + ---------- + + estimator_ : ``Estimator`` instance + The fitted estimator used to select features. + + cv_results_ : dict of ndarrays + A dict with keys: + + split(k)_test_score : ndarray of shape (n_subsets_of_features,) + The cross-validation scores across (k)th fold. + + mean_test_score : ndarray of shape (n_subsets_of_features,) + Mean of scores over the folds. + + std_test_score : ndarray of shape (n_subsets_of_features,) + Standard deviation of scores over the folds. + + + n_features_ : int + The number of selected features with cross-validation. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + ranking_ ?: narray of shape (n_features,) + The feature ranking, such that `ranking_[i]` + corresponds to the ranking + position of the i-th feature. + Selected (i.e., estimated best) + features are assigned rank 1. + + support_ : ndarray of shape (n_features,) + The mask of selected features. 
+ + """ + + def __init__( + self, + estimator: BaseEstimator, + fit_params: dict = {}, + max_nfeatures: int = None, + mean_perf_weight: float = 1.0, + std_perf_weight: float = 0.1, + feature_cost: float = 0.00 / 100, + smooth_perf: int = 3, + # stopping conditions + max_runtime_mins: float = None, + max_refits: int = None, + best_desired_score: float = None, + max_noimproving_iters: int = 30, + # CV + cv: Union[object, int, None] = 3, + cv_shuffle: bool = False, + # Other + early_stopping_val_nsplits: Union[int, None] = 4, + early_stopping_rounds: Union[int, None] = None, + scoring: Union[object, None] = None, + top_predictors_search_method: OptimumSearch = OptimumSearch.ModelBasedHeuristic, + votes_aggregation_method: VotesAggregation = VotesAggregation.Borda, + use_all_fi_runs: bool = True, + use_last_fi_run_only: bool = False, + use_one_freshest_fi_run: bool = False, + use_fi_ranking: bool = False, + importance_getter: Union[str, Callable, None] = None, + random_state: int = None, + leave_progressbars: bool = True, + verbose: Union[bool, int] = 0, + show_plot: bool = False, + cat_features: Union[Sequence, None] = None, + keep_estimators: bool = False, + estimators_save_path: str = None, # fitted estimators get saved into join(estimators_save_path,estimator_type_name,nestimator_nfeatures_nfold.dump) + # Required features and achieved ml metrics get saved in a dict join(estimators_save_path,required_features.dump). + frac: float = None, + skip_retraining_on_same_shape: bool = False, + ): + + # checks + if frac is not None: + assert frac > 0.0 and frac < 1.0 + + # assert isinstance(estimator, (BaseEstimator,)) + + # save params + + params = get_parent_func_args() + store_params_in_object(obj=self, params=params) + self.signature = None + + def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.DataFrame, pd.Series, np.ndarray], groups: Union[pd.Series, np.ndarray] = None, **fit_params): + + # ---------------------------------------------------------------------------------------------------------------------------- + # Compute inputs/outputs signature + # ---------------------------------------------------------------------------------------------------------------------------- + + signature = (X.shape, y.shape) + if self.skip_retraining_on_same_shape: + if signature == self.signature: + if self.verbose: + logger.info(f"Skipping retraining on the same inputs signature {signature}") + return self + + # --------------------------------------------------------------------------------------------------------------- + # Inits + # --------------------------------------------------------------------------------------------------------------- + + estimator = self.estimator + fit_params = copy.copy(self.fit_params) + max_runtime_mins = self.max_runtime_mins + max_refits = self.max_refits + cv = self.cv + cv_shuffle = self.cv_shuffle + early_stopping_val_nsplits = self.early_stopping_val_nsplits + early_stopping_rounds = self.early_stopping_rounds + scoring = self.scoring + top_predictors_search_method = self.top_predictors_search_method + votes_aggregation_method = self.votes_aggregation_method + use_all_fi_runs = self.use_all_fi_runs + use_last_fi_run_only = self.use_last_fi_run_only + use_one_freshest_fi_run = self.use_one_freshest_fi_run + use_fi_ranking = self.use_fi_ranking + importance_getter = self.importance_getter + random_state = self.random_state + leave_progressbars = self.leave_progressbars + verbose = self.verbose + show_plot = self.show_plot + cat_features = self.cat_features 
+ keep_estimators = self.keep_estimators + feature_cost = self.feature_cost + smooth_perf = self.smooth_perf + frac = self.frac + best_desired_score = self.best_desired_score + max_noimproving_iters = self.max_noimproving_iters + + start_time = timer() + ran_out_of_time = False + + if random_state is not None: + set_random_seed(random_state) + + feature_importances = {} + evaluated_scores_std = {} + evaluated_scores_mean = {} + + if isinstance(X, pd.DataFrame): + original_features = X.columns.tolist() + else: + original_features = np.arange(X.shape[1]) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Init cv + # ---------------------------------------------------------------------------------------------------------------------------- + + if cv is None or str(cv).isnumeric(): + if cv is None: + cv = 3 + if is_classifier(estimator): + if groups is not None: + cv = StratifiedGroupKFold(n_splits=cv, shuffle=cv_shuffle, random_state=random_state) + else: + cv = StratifiedKFold(n_splits=cv, shuffle=cv_shuffle, random_state=random_state) + else: + if groups is not None: + cv = GroupKFold(n_splits=cv, shuffle=cv_shuffle, random_state=random_state) + else: + cv = KFold(n_splits=cv, shuffle=cv_shuffle, random_state=random_state) + if verbose: + logger.info(f"Using cv={cv}") + + if early_stopping_val_nsplits: + val_cv = copy.copy(cv) + val_cv.n_splits = early_stopping_val_nsplits + if not early_stopping_rounds: + early_stopping_rounds = 20 # TODO: derive as 1/5 of nestimators' + else: + val_cv = None + + if verbose: + iters_pbar = tqdmu( + desc="RFECV iterations", leave=leave_progressbars, total=min(len(original_features), max_refits) if max_refits else len(original_features) + ) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Init scoring + # ---------------------------------------------------------------------------------------------------------------------------- + + if scoring is None: + if is_classifier(estimator): + scoring = make_scorer(score_func=sklearn_integral_calibration_error, needs_proba=True, needs_threshold=False, greater_is_better=False) + elif is_regressor(estimator): + scoring = make_scorer(score_func=mean_squared_error, needs_proba=False, needs_threshold=False, greater_is_better=False) + else: + raise ValueError(f"Scoring not known for estimator type: {estimator}") + + # ---------------------------------------------------------------------------------------------------------------------------- + # Init importance_getter + # ---------------------------------------------------------------------------------------------------------------------------- + + if isinstance(estimator, Pipeline): + estimator_type = type(estimator.steps[-1][1]).__name__ + else: + estimator_type = type(estimator).__name__ + + if importance_getter is None or importance_getter == "auto": + if estimator_type in ("LogisticRegression",): + importance_getter = "coef_" + else: + importance_getter = "feature_importances_" + + # ---------------------------------------------------------------------------------------------------------------------------- + # Start evaluating different nfeatures, being guided by the selected search method + # ---------------------------------------------------------------------------------------------------------------------------- + + nsteps = 0 + dummy_scores = [] + fitted_estimators = {} + 
selected_features_per_nfeatures = {} + + if top_predictors_search_method == OptimumSearch.ModelBasedHeuristic: + Optimizer = MBHOptimizer( + search_space=( + np.array(np.arange(min(self.max_nfeatures, len(original_features)) + 1).tolist() + [len(original_features)]) + if self.max_nfeatures + else np.arange(len(original_features) + 1) + ), + direction=OptimizationDirection.Maximize, + init_sampling_method=CandidateSamplingMethod.Equidistant, + init_evaluate_ascending=False, + init_evaluate_descending=True, + plotting=OptimizationProgressPlotting.OnScoreImprovement, + seeded_inputs=[min(2, len(original_features))], + ) + else: + Optimizer = None + + n_noimproving_iters = 0 + best_score = -1e6 + + while nsteps < len(original_features): + + if verbose: + iters_pbar.update(1) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Select current set of features to work on, based on ranking received so far, and the optimum search method + # ---------------------------------------------------------------------------------------------------------------------------- + + current_features = get_next_features_subset( + nsteps=nsteps, + original_features=original_features, + feature_importances=feature_importances, + evaluated_scores_mean=evaluated_scores_mean, + evaluated_scores_std=evaluated_scores_std, + use_all_fi_runs=use_all_fi_runs, + use_last_fi_run_only=use_last_fi_run_only, + use_one_freshest_fi_run=use_one_freshest_fi_run, + use_fi_ranking=use_fi_ranking, + top_predictors_search_method=top_predictors_search_method, + votes_aggregation_method=votes_aggregation_method, + Optimizer=Optimizer, + ) + + if current_features is None or len(current_features) == 0: + break # nothing more to try + + selected_features_per_nfeatures[len(current_features)] = current_features + + # ---------------------------------------------------------------------------------------------------------------------------- + # Each split better be different. so, even if random_state is provided, random_state to the cv is generated separately + # (and deterministically) each time based on the original random_state. 
+ # ---------------------------------------------------------------------------------------------------------------------------- + + scores = [] + + splitter = cv.split(X=X, y=y, groups=groups) + if verbose: + splitter = tqdmu(splitter, desc="CV folds", leave=False, total=cv.n_splits) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Evaluate currently selected set of features on CV + # ---------------------------------------------------------------------------------------------------------------------------- + + for nfold, (train_index, test_index) in enumerate(splitter): + + if frac: + size = int(len(train_index) * frac) + if size > 10: + train_index = np.random.choice(train_index, size=size) + + X_train, y_train, X_test, y_test = split_into_train_test( + X=X, y=y, train_index=train_index, test_index=test_index, features_indices=current_features + ) # this splits both dataframes & ndarrays in the same fashion + + if val_cv and has_early_stopping_support(estimator_type): + + # ---------------------------------------------------------------------------------------------------------------------------- + # Make additional early stopping split from X_train + # ---------------------------------------------------------------------------------------------------------------------------- + + if groups is not None: + if isinstance(groups, pd.Series): + train_groups = groups.iloc[train_index] + else: + train_groups = groups[train_index] + else: + train_groups = None + + for true_train_index, val_index in val_cv.split(X=X_train, y=y_train, groups=train_groups): + break # need only 1 iteration of 2nd split + + X_train, y_train, X_val, y_val = split_into_train_test(X=X_train, y=y_train, train_index=true_train_index, test_index=val_index) + + # ---------------------------------------------------------------------------------------------------------------------------- + # If estimator is known, apply early stopping to its fit params + # ---------------------------------------------------------------------------------------------------------------------------- + + temp_cat_features = [current_features.index(var) for var in cat_features if var in current_features] if cat_features else None + + temp_fit_params = pack_val_set_into_fit_params( + model=estimator, + X_val=X_val, + y_val=y_val, + early_stopping_rounds=early_stopping_rounds, + cat_features=temp_cat_features, + ) # crafts fit params with early stopping tailored to particular model type. + temp_fit_params.update(fit_params) + + else: + temp_fit_params = {} + X_val = None + + # ---------------------------------------------------------------------------------------------------------------------------- + # Fit our estimator on current train fold. Score on test & and get its feature importances. + # ---------------------------------------------------------------------------------------------------------------------------- + + # TODO! 
invoke different hyperparameters generation here + + if keep_estimators: + fitted_estimator = copy.copy(estimator) + else: + fitted_estimator = estimator + + fitted_estimator.fit(X=X_train, y=y_train, **temp_fit_params) + + score = scoring(fitted_estimator, X_test, y_test) + scores.append(score) + fi = get_feature_importances( + model=fitted_estimator, current_features=current_features, data=X_test, reference_data=X_val, importance_getter=importance_getter + ) + # feature_indices,imp_values=list(fi.keys()),list(fi.values()) + # print(np.array(feature_indices)[np.argsort(imp_values)[-10:]]) + + key = f"{len(current_features)}_{nfold}" + feature_importances[key] = fi + + if keep_estimators: + fitted_estimators[key] = fitted_estimator + + # print(f"feature_importances[step{len(current_features)}_fold{nfold}]=" + str({key: value for key, value in fi.items() if value > 0})) + + if 0 not in evaluated_scores_mean: + + # ---------------------------------------------------------------------------------------------------------------------------- + # Dummy baselines must serve as fitness @ 0 features. + # ---------------------------------------------------------------------------------------------------------------------------- + dummy_scores.append( + get_best_dummy_score(estimator=estimator, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, scoring=scoring) + ) + # print(f"Best dummy score (at 0 features, fold {nfold}): {best_dummy_score}") + + if 0 not in evaluated_scores_mean: + store_averaged_cv_scores(pos=0, scores=dummy_scores, evaluated_scores_mean=evaluated_scores_mean, evaluated_scores_std=evaluated_scores_std) + if top_predictors_search_method == OptimumSearch.ModelBasedHeuristic: + Optimizer.submit_evaluations(candidates=[0], evaluations=[np.array(dummy_scores).mean()], durations=[None]) + + scores_mean, scores_std = store_averaged_cv_scores( + pos=len(current_features), scores=scores, evaluated_scores_mean=evaluated_scores_mean, evaluated_scores_std=evaluated_scores_std + ) + if top_predictors_search_method == OptimumSearch.ModelBasedHeuristic: + Optimizer.submit_evaluations(candidates=[len(current_features)], evaluations=[scores_mean], durations=[None]) + + if verbose: + logger.info( + f"trying nfeatures={len(current_features)}, score={scores_mean:.6f}, len(train_index)={len(train_index)}, len(test_index)={len(test_index)}" + ) + + # ---------------------------------------------------------------------------------------------------------------------------- + # Checking exit conditions + # ---------------------------------------------------------------------------------------------------------------------------- + + nsteps += 1 + + if max_runtime_mins and not ran_out_of_time: + delta = timer() - start_time + ran_out_of_time = delta > max_runtime_mins * 60 + if ran_out_of_time: + if verbose: + logger.info(f"max_runtime_mins={max_runtime_mins:_.1f} reached.") + break + + if max_refits and nsteps >= max_refits: + if verbose: + logger.info(f"max_refits={max_refits:_} reached.") + break + + if scores_mean >= best_score: + best_score = scores_mean + n_noimproving_iters = 0 + else: + n_noimproving_iters += 1 + + # sklearn-style scorers are greater-is-better, so a single >= check against best_desired_score is sufficient + if best_desired_score is not None and scores_mean >= best_desired_score: + if verbose: + logger.info(f"best_desired_score {best_desired_score:_.6f} reached.") + break + + if max_noimproving_iters and 
n_noimproving_iters >= max_noimproving_iters: + if verbose: + logger.info(f"Max # of noimproved iters reached: {n_noimproving_iters}") + break + + # ---------------------------------------------------------------------------------------------------------------------------- + # Saving best result found so far as final + # ---------------------------------------------------------------------------------------------------------------------------- + + self.n_features_in_ = X.shape[1] + self.feature_names_in_ = X.columns if isinstance(X, pd.DataFrame) else map(str, np.arange(self.n_features_in_)) + + self.estimators_ = fitted_estimators # a dict with key=nfeatures_nfold + self.feature_importances_ = feature_importances # a dict with key=nfeatures_nfold + self.selected_features_ = selected_features_per_nfeatures # a dict with key=nfeatures + + checked_nfeatures = sorted(evaluated_scores_mean.keys()) + cv_std_perf = [evaluated_scores_std[n] for n in checked_nfeatures] + cv_mean_perf = [evaluated_scores_mean[n] for n in checked_nfeatures] + self.cv_results_ = {"nfeatures": checked_nfeatures, "cv_mean_perf": cv_mean_perf, "cv_std_perf": cv_std_perf} + + self.select_optimal_nfeatures_( + checked_nfeatures=checked_nfeatures, + cv_mean_perf=cv_mean_perf, + cv_std_perf=cv_std_perf, + mean_perf_weight=self.mean_perf_weight, + std_perf_weight=self.std_perf_weight, + feature_cost=feature_cost, + smooth_perf=smooth_perf, + use_all_fi_runs=use_all_fi_runs, + use_last_fi_run_only=use_last_fi_run_only, + use_one_freshest_fi_run=use_one_freshest_fi_run, + use_fi_ranking=use_fi_ranking, + votes_aggregation_method=votes_aggregation_method, + verbose=verbose, + show_plot=show_plot, + ) + + self.signature = signature + return self + + def select_optimal_nfeatures_( + self, + checked_nfeatures: np.ndarray, + cv_mean_perf: np.ndarray, + cv_std_perf: np.ndarray, + mean_perf_weight: float = 1.0, + std_perf_weight: float = 0.1, + feature_cost: float = 0.01 / 100, + smooth_perf: int = 3, + use_all_fi_runs: bool = True, + use_last_fi_run_only: bool = False, + use_one_freshest_fi_run: bool = False, + use_fi_ranking: bool = False, + votes_aggregation_method: VotesAggregation = VotesAggregation.Borda, + verbose: bool = False, + show_plot: bool = False, + plot_file=None, + font_size: int = 12, + figsize: tuple = (10, 7), + ): + + base_perf = np.array(cv_mean_perf) * mean_perf_weight - np.array(cv_std_perf) * std_perf_weight + if smooth_perf: + smoothed_perf = pd.Series(base_perf).rolling(smooth_perf, center=True).mean().values + idx = np.isnan(smoothed_perf) + smoothed_perf[idx] = base_perf[idx] + base_perf = smoothed_perf + + # ultimate_perf = base_perf / (np.log1p(np.arange(len(base_perf))) + comparison_base) + ultimate_perf = base_perf - np.arange(len(base_perf)) * feature_cost + + best_idx = np.argmax(ultimate_perf) + best_top_n = checked_nfeatures[best_idx] + + if show_plot or plot_file: + plt.rcParams.update({"font.size": font_size}) + fig, ax1 = plt.subplots(figsize=figsize) + ax2 = ax1.twinx() + + ax1.set_xlabel("Number of features selected") + ax1.set_ylabel("Mean CV score", c="b") + + ax1.errorbar(checked_nfeatures, cv_mean_perf, yerr=cv_std_perf, c="b", alpha=0.4) + + ax2.plot(checked_nfeatures, ultimate_perf, c="g") + ax1.plot(checked_nfeatures[best_idx], base_perf[best_idx], "ro") + ax2.set_ylabel("Adj CV score", c="g") + + plt.title("Performance by nfeatures") + plt.tight_layout() + + if plot_file: + plt.savefig(plot_file) + if show_plot: + plt.show() + + # after making a cutoff decision: + + self.n_features_ = 
best_top_n + + # last time vote for feature_importances using all info up to date + + fi_to_consider = select_appropriate_feature_importances( + feature_importances=self.feature_importances_, + nfeatures=best_top_n, + n_original_features=self.n_features_in_, + use_all_fi_runs=use_all_fi_runs, + use_last_fi_run_only=use_last_fi_run_only, + use_one_freshest_fi_run=use_one_freshest_fi_run, + use_fi_ranking=use_fi_ranking, + ) + + self.ranking_ = get_actual_features_ranking( + feature_importances=fi_to_consider, + votes_aggregation_method=votes_aggregation_method, + ) + + self.support_ = np.array([(i in self.ranking_[:best_top_n]) for i in self.feature_names_in_]) + + if verbose: + dummy_gain = base_perf[0] / base_perf[best_idx] - 1 + allfeat_gain = base_perf[-1] / base_perf[best_idx] - 1 + logger.info( + f"{self.n_features_:_} predictive factors selected out of {self.n_features_in_:_} during {len(self.selected_features_):_} rounds. Gain vs dummy={dummy_gain*100:.1f}%, gain vs all features={allfeat_gain*100:.1f}%" + ) + + def transform(self, X, y=None): + if isinstance(X, pd.DataFrame): + return X.iloc[:, self.support_] + else: + return X[:, self.support_] + + +def split_into_train_test( + X: Union[pd.DataFrame, np.ndarray], y: Union[pd.DataFrame, np.ndarray], train_index: np.ndarray, test_index: np.ndarray, features_indices: np.ndarray = None +) -> tuple: + """Split X & y according to indices & dtypes. Basically this accounts for diffeent dtypes (pd.DataFrame, np.ndarray) to perform the same.""" + + if isinstance(X, pd.DataFrame): + X_train, y_train = (X.iloc[train_index, :] if features_indices is None else X.iloc[train_index, :][features_indices]), ( + y.iloc[train_index, :] if isinstance(y, pd.DataFrame) else (y.iloc[train_index] if isinstance(y, pd.Series) else y[train_index]) + ) + X_test, y_test = (X.iloc[test_index, :] if features_indices is None else X.iloc[test_index, :][features_indices]), ( + y.iloc[test_index, :] if isinstance(y, pd.DataFrame) else (y.iloc[test_index] if isinstance(y, pd.Series) else y[test_index]) + ) + else: + X_train, y_train = (X[train_index, :] if features_indices is None else X[train_index, :][:, features_indices]), ( + y[train_index, :] if len(y.shape) > 1 else y[train_index] + ) + X_test, y_test = (X[test_index, :] if features_indices is None else X[test_index, :][:, features_indices]), ( + y[test_index, :] if len(y.shape) > 1 else y[test_index] + ) + + return X_train, y_train, X_test, y_test + + +def store_averaged_cv_scores(pos: int, scores: list, evaluated_scores_mean: dict, evaluated_scores_std: dict) -> None: + scores = np.array(scores) + scores_mean, scores_std = np.mean(scores), np.std(scores) + evaluated_scores_mean[pos] = scores_mean + evaluated_scores_std[pos] = scores_std + + return scores_mean, scores_std + + +def get_feature_importances( + model: object, + current_features: list, + importance_getter: Union[str, Callable], + data: Union[pd.DataFrame, np.ndarray, None] = None, + reference_data: Union[pd.DataFrame, np.ndarray, None] = None, +) -> dict: + + if isinstance(importance_getter, str): + res = getattr(model, importance_getter) + if importance_getter == "coef_": + res = np.abs(res) + if res.ndim > 1: + res = res.sum(axis=0) + else: + res = importance_getter(model=model, data=data, reference_data=reference_data) + + assert len(res) == len(current_features) + return {feature_index: feature_importance for feature_index, feature_importance in zip(current_features, res)} + + +def get_next_features_subset( + nsteps: int, + original_features: 
list, + feature_importances: pd.DataFrame, + evaluated_scores_mean: dict, + evaluated_scores_std: dict, + use_all_fi_runs: bool, + use_last_fi_run_only: bool, + use_one_freshest_fi_run: bool, + use_fi_ranking: bool, + top_predictors_search_method: OptimumSearch = OptimumSearch.ScipyLocal, + votes_aggregation_method: VotesAggregation = VotesAggregation.Borda, + Optimizer: object = None, +) -> list: + """Generates a "next_nfeatures_to_check" candidate to evaluate. + Decides on a subset of FIs to use (all, freshest preceeding, all preceeding). + Combines FIs from different runs into ranks using voting. + Selects next_nfeatures_to_check best ranked features as candidates for the upcoming FI evaluation. + The whole idea of this approach is that we don't need to go all the way from len(original_features) up to 0 and evaluate + EVERY nfeatures. for 10k features and 1TB datast it's a waste. + """ + + # ---------------------------------------------------------------------------------------------------------------------------- + # First step is to try all features. + # ---------------------------------------------------------------------------------------------------------------------------- + + if nsteps == 0: + return original_features + else: + remaining = list(set(np.arange(1, len(original_features))) - set(evaluated_scores_mean.keys())) + if len(remaining) == 0: + return [] + else: + + if top_predictors_search_method == OptimumSearch.ExhaustiveRandom: + next_nfeatures_to_check = random.choice(remaining) + elif top_predictors_search_method == OptimumSearch.ModelBasedHeuristic: + next_nfeatures_to_check = Optimizer.suggest_candidate() + + if next_nfeatures_to_check is not None: + + # ----------------------------- ----------------------------------------------------------------------------------------------- + # At each step, feature importances must be recalculated in light of recent training on a smaller subset. + # The features already thrown away all receive constant importance update of the same size, to keep up with number of trains (?) + # ---------------------------------------------------------------------------------------------------------------------------- + + fi_to_consider = select_appropriate_feature_importances( + feature_importances=feature_importances, + nfeatures=next_nfeatures_to_check, + n_original_features=len(original_features), + use_all_fi_runs=use_all_fi_runs, + use_last_fi_run_only=use_last_fi_run_only, + use_one_freshest_fi_run=use_one_freshest_fi_run, + use_fi_ranking=use_fi_ranking, + ) + ranks = get_actual_features_ranking(feature_importances=fi_to_consider, votes_aggregation_method=votes_aggregation_method) + # print(f"fi_to_consider={fi_to_consider}") + # print(f"ranks={ranks}") + # print(f"next_nfeatures_to_check={next_nfeatures_to_check}, features chosen={ranks[:next_nfeatures_to_check]}") + + return ranks[:next_nfeatures_to_check] + else: + return [] + + +def select_appropriate_feature_importances( + feature_importances: dict, + nfeatures: int, + n_original_features: int, + use_all_fi_runs: bool = True, + use_last_fi_run_only: bool = False, + use_one_freshest_fi_run: bool = False, + use_fi_ranking: bool = False, +) -> dict: + + if use_last_fi_run_only: + # use train folds with specific length. 
+
+
+def select_appropriate_feature_importances(
+    feature_importances: dict,
+    nfeatures: int,
+    n_original_features: int,
+    use_all_fi_runs: bool = True,
+    use_last_fi_run_only: bool = False,
+    use_one_freshest_fi_run: bool = False,
+    use_fi_ranking: bool = False,
+) -> dict:
+    """Pick which of the stored FI runs to feed into the voting, depending on the flags."""
+
+    if use_last_fi_run_only:
+        # use train folds with a specific length; the FI dict key is nfeatures_nfold
+        fi_to_consider = {key: value for key, value in feature_importances.items() if len(value) == n_original_features}
+    else:
+        if use_all_fi_runs:
+            # use all fi data collected so far
+            fi_to_consider = {key: value for key, value in feature_importances.items() if len(value) > 1} if n_original_features > 1 else feature_importances
+        else:
+            # can only use runs preceding nfeatures here.
+            if use_one_freshest_fi_run:
+                # freshest preceding
+                fi_to_consider = {}
+                for possible_nfeatures in range(nfeatures + 1, n_original_features):
+                    for key, value in feature_importances.items():
+                        if len(value) == possible_nfeatures:
+                            fi_to_consider[key] = value
+                    if fi_to_consider:
+                        print(f"using freshest FI of {possible_nfeatures} features for nfeatures={nfeatures}")
+                        break
+            else:
+                # all preceding
+                fi_to_consider = {key: value for key, value in feature_importances.items() if (len(value) > nfeatures and len(value) != 1)}
+
+    if use_fi_ranking:
+        fi_to_consider = {key: pd.Series(value).rank(ascending=True, pct=True).to_dict() for key, value in fi_to_consider.items()}
+
+    return fi_to_consider
+
+
+def get_actual_features_ranking(feature_importances: dict, votes_aggregation_method: VotesAggregation) -> list:
+    """Absolute FIs from estimators trained on CV for each nfeatures are stored separately.
+    They can be used to recompute the final voted importances using any desired voting algo.
+    But of course the exploration path was already led by the specific voting algo active at fitting time.
+
+    GM, and especially Minimax & Plurality, are suboptimal for FS.
+    """
+
+    from votenrank import Leaderboard
+
+    lb = Leaderboard(table=pd.DataFrame(feature_importances))
+    if votes_aggregation_method == VotesAggregation.Borda:
+        ranks = lb.borda_ranking()
+    elif votes_aggregation_method == VotesAggregation.AM:
+        ranks = lb.mean_ranking(mean_type="arithmetic")
+    elif votes_aggregation_method == VotesAggregation.GM:
+        ranks = lb.mean_ranking(mean_type="geometric")
+    elif votes_aggregation_method == VotesAggregation.Copeland:
+        ranks = lb.copeland_ranking()
+    elif votes_aggregation_method == VotesAggregation.Dowdall:
+        ranks = lb.dowdall_ranking()
+    elif votes_aggregation_method == VotesAggregation.Minimax:
+        ranks = lb.minimax_ranking()
+    elif votes_aggregation_method == VotesAggregation.OG:
+        ranks = lb.optimality_gap_ranking(gamma=1)
+    elif votes_aggregation_method == VotesAggregation.Plurality:
+        ranks = lb.plurality_ranking()
+    else:
+        raise ValueError(f"Unsupported votes_aggregation_method: {votes_aggregation_method}")
+
+    # print("Current features ranks:")
+    # print(ranks)
+    return ranks.index.values.tolist()
diff --git a/helpers.py b/mlframe/helpers.py similarity index 100% rename from helpers.py rename to mlframe/helpers.py diff --git a/inference.py b/mlframe/inference.py similarity index 100% rename from inference.py rename to mlframe/inference.py diff --git a/keras.py b/mlframe/keras.py similarity index 100% rename from keras.py rename to mlframe/keras.py diff --git a/lightninglib.py b/mlframe/lightninglib.py similarity index 100% rename from lightninglib.py rename to mlframe/lightninglib.py diff --git a/metrics.py b/mlframe/metrics.py similarity index 100% rename from metrics.py rename to mlframe/metrics.py diff --git a/mlflowlib.py b/mlframe/mlflowlib.py similarity index 100% rename from mlflowlib.py rename to mlframe/mlflowlib.py diff --git a/model_selection.py b/mlframe/model_selection.py similarity index 100% rename from model_selection.py rename to mlframe/model_selection.py diff --git a/optimization.py b/mlframe/optimization.py similarity index 100% rename from optimization.py rename to
mlframe/optimization.py diff --git a/outliers.py b/mlframe/outliers.py similarity index 100% rename from outliers.py rename to mlframe/outliers.py diff --git a/pipelines.py b/mlframe/pipelines.py similarity index 100% rename from pipelines.py rename to mlframe/pipelines.py diff --git a/preprocessing.py b/mlframe/preprocessing.py similarity index 100% rename from preprocessing.py rename to mlframe/preprocessing.py diff --git a/public_suffix_list.dat b/mlframe/public_suffix_list.dat similarity index 100% rename from public_suffix_list.dat rename to mlframe/public_suffix_list.dat diff --git a/stats.py b/mlframe/stats.py similarity index 100% rename from stats.py rename to mlframe/stats.py diff --git a/synthetic.py b/mlframe/synthetic.py similarity index 100% rename from synthetic.py rename to mlframe/synthetic.py diff --git a/tests.py b/mlframe/tests.py similarity index 100% rename from tests.py rename to mlframe/tests.py diff --git a/text.py b/mlframe/text.py similarity index 100% rename from text.py rename to mlframe/text.py diff --git a/tuning.py b/mlframe/tuning.py similarity index 100% rename from tuning.py rename to mlframe/tuning.py diff --git a/unittest_arrays.py b/mlframe/unittest_arrays.py similarity index 100% rename from unittest_arrays.py rename to mlframe/unittest_arrays.py diff --git a/utils.py b/mlframe/utils.py similarity index 100% rename from utils.py rename to mlframe/utils.py diff --git a/version.py b/mlframe/version.py similarity index 100% rename from version.py rename to mlframe/version.py
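Finally, a minimal sketch (not part of the patch) of recomputing a voted ranking offline from stored FI runs, using select_appropriate_feature_importances and get_actual_features_ranking defined above; the votenrank dependency is required, and the run keys and importance values below are invented:

    toy_feature_importances = {
        "4_0": {"f1": 0.40, "f2": 0.30, "f3": 0.20, "f4": 0.10},  # fold 0, trained on all 4 features
        "4_1": {"f1": 0.35, "f2": 0.25, "f3": 0.30, "f4": 0.10},  # fold 1, trained on all 4 features
    }

    fi_to_consider = select_appropriate_feature_importances(
        feature_importances=toy_feature_importances,
        nfeatures=2,
        n_original_features=4,
        use_all_fi_runs=True,
    )
    ranking = get_actual_features_ranking(
        feature_importances=fi_to_consider,
        votes_aggregation_method=VotesAggregation.Borda,
    )
    print(ranking)  # feature names ordered from strongest to weakest according to Borda voting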