-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
54 changed files
with
6,481 additions
and
0 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
"""Basic feature engineering for ML.""" | ||
|
||
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Packages | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order | ||
|
||
# ensure_installed("numpy pandas") | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
|
||
import warnings | ||
|
||
warnings.simplefilter(action="ignore", category=FutureWarning) | ||
import pandas as pd, numpy as np | ||
|
||
|
||
def create_date_features(
    df: pd.DataFrame,
    cols: list,
    delete_original_cols: bool = True,
    bulk: bool = False,
    methods: Optional[dict] = None,
) -> pd.DataFrame:
    """Expand datetime columns of *df* into integer calendar features.

    For every column in *cols* and every attribute in *methods*, a new column
    ``<col>_<method>`` is created from the column's ``.dt`` accessor and cast to
    the requested dtype.

    Args:
        df: frame containing datetime64 columns (modified in place unless bulk concat is used).
        cols: names of the datetime columns to expand.
        delete_original_cols: drop the source datetime columns after expansion.
        bulk: accumulate new columns in a dict and attach them with a single
            ``pd.concat`` (avoids repeated frame insertions / fragmentation).
        methods: mapping of ``.dt`` attribute name -> output dtype. ``None`` selects
            the default ``{"day": np.int8, "weekday": np.int8, "month": np.int8}``.
            (Kept out of the signature to avoid the shared-mutable-default pitfall;
            other candidates: "week", "quarter": np.int8, "year": np.int16.)

    Returns:
        The enriched DataFrame. Also returned (unchanged) when *cols* is empty,
        so callers can always chain on the result.
    """
    if methods is None:
        methods = {"day": np.int8, "weekday": np.int8, "month": np.int8}

    if not cols:
        return df

    # In bulk mode collect new columns separately; otherwise insert directly into df.
    tmp = {} if bulk else df
    for col in cols:
        for method, dtype in methods.items():
            tmp[col + "_" + method] = getattr(df[col].dt, method).astype(dtype)

    if delete_original_cols:
        df.drop(columns=cols, inplace=True)

    if bulk:
        df = pd.concat([df, pd.DataFrame(tmp)], axis=1)

    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
"""Categorical feature engineering for ML. Optimized & rich set of aggregates for 1d vectors.""" | ||
|
||
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Packages | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from pyutilz.pythonlib import ( | ||
ensure_installed, | ||
) # lint: disable=ungrouped-imports,disable=wrong-import-order | ||
|
||
# ensure_installed("numpy pandas scipy") | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
|
||
import warnings | ||
|
||
warnings.simplefilter(action="ignore", category=FutureWarning) | ||
|
||
from antropy import * | ||
import pandas as pd, numpy as np | ||
from scipy.stats import entropy | ||
from .numerical import compute_numaggs, get_numaggs_names | ||
|
||
import warnings | ||
|
||
warnings.filterwarnings("ignore", message="nperseg =") | ||
|
||
numaggs_names = get_numaggs_names() | ||
directional_numaggs_names = get_numaggs_names(directional_only=True) | ||
|
||
|
||
def compute_countaggs(
    arr: pd.Series,
    counts_normalize: bool = True,  # use relative (normalize=True) or absolute counts
    counts_compute_numaggs: bool = True,  # compute numerical aggregates over the counts themselves
    counts_top_n: int = 1,  # return that many highest/lowest value counts
    counts_return_top_counts: bool = True,  # return top counts
    counts_return_top_values: bool = True,  # return top values
    counts_compute_values_numaggs: bool = False,  # if values are numeric, compute order-dependent numaggs over them (in order of their counts)
    numerical_kwargs: Optional[dict] = None,  # forwarded to compute_numaggs; None -> dict(return_unsorted_stats=False)
):
    """Compute aggregate features from a series' value counts.

    For variables with many repeated values (or categorical ones), value_counts(normalize=...) yields:
    1) Top N highest/lowest values along with their counts (missing slots padded with NaNs)
    2) numaggs over the counts data
    3) if the variable is numeric, directional-only numaggs for the values series sorted by counts
       (order-dependent aggregates only, otherwise it would duplicate numaggs over the regular series)

    NOTE(review): the default of counts_compute_values_numaggs is False here but True in
    get_countaggs_names — with pure defaults the feature names and values misalign; pass the
    same flags to both functions (confirm which default is intended).
    """
    # None-default avoids the shared-mutable-default-argument pitfall.
    if numerical_kwargs is None:
        numerical_kwargs = dict(return_unsorted_stats=False)

    value_counts = arr.value_counts(normalize=counts_normalize)
    value_counts = value_counts[value_counts > 0]  # presumably drops 0-count entries (e.g. unused categorical levels) — confirm
    values = value_counts.index.values
    counts = value_counts.values

    res = []

    if counts_compute_numaggs:
        res.extend(compute_numaggs(arr=counts, **numerical_kwargs))

    if counts_top_n:
        # Pad with NaNs when there are fewer distinct values than requested.
        if len(counts) >= counts_top_n:
            extra = []
        else:
            extra = [np.nan] * (counts_top_n - len(counts))

        if counts_return_top_counts:
            res.extend(counts[:counts_top_n].tolist() + extra)
            res.extend(extra + counts[-counts_top_n:].tolist())
        if counts_return_top_values:
            res.extend(values[:counts_top_n].tolist() + extra)
            res.extend(extra + values[-counts_top_n:].tolist())

    if counts_compute_values_numaggs:
        if pd.api.types.is_numeric_dtype(values):
            # Copy before mutation so the caller's kwargs dict is never modified.
            processed_numerical_kwargs = numerical_kwargs.copy()
            processed_numerical_kwargs["directional_only"] = True
            res.extend(compute_numaggs(arr=values, **processed_numerical_kwargs))
        else:
            # Non-numeric values: emit NaN placeholders so the feature vector length stays fixed.
            res.extend([np.nan] * len(directional_numaggs_names))

    return res
|
||
|
||
def get_countaggs_names(
    counts_normalize: bool = True,  # use relative (normalize=True) or absolute counts
    counts_compute_numaggs: bool = True,  # compute numerical aggregates over counts data or not
    counts_top_n: int = 1,  # return that many highest/lowest value counts
    counts_return_top_counts: bool = True,  # return top counts
    counts_return_top_values: bool = True,  # return top values
    counts_compute_values_numaggs: bool = True,  # NOTE(review): default differs from compute_countaggs (False there) — pass flags explicitly
    numerical_kwargs: Optional[dict] = None,  # forwarded to get_numaggs_names; None -> dict(return_unsorted_stats=False)
) -> list:
    """Return the feature names matching compute_countaggs output.

    Must be called with the same flags as compute_countaggs for the names to
    align with the computed values.
    """
    # None-default avoids the shared-mutable-default-argument pitfall.
    if numerical_kwargs is None:
        numerical_kwargs = dict(return_unsorted_stats=False)

    res = []

    if counts_compute_numaggs:
        # Suffix marks whether counts were normalized (cntnrm) or absolute (cnt).
        res.extend([feat + "_" + ("cntnrm" if counts_normalize else "cnt") for feat in get_numaggs_names(**numerical_kwargs)])

    if counts_top_n:
        if counts_return_top_counts:
            res.extend(["top_" + str(i + 1) + "_vcnt" for i in range(counts_top_n)])
            res.extend(["btm_" + str(counts_top_n - i) + "_vcnt" for i in range(counts_top_n)])
        if counts_return_top_values:
            res.extend(["top_" + str(i + 1) + "_vval" for i in range(counts_top_n)])
            res.extend(["btm_" + str(counts_top_n - i) + "_vval" for i in range(counts_top_n)])

    if counts_compute_values_numaggs:
        # directional_numaggs_names is precomputed at module level with directional_only=True,
        # so numerical_kwargs does not need to be processed here (dead copy removed).
        res.extend([feat + "_vvls" for feat in directional_numaggs_names])

    return res
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
"""Compute the Hurst Exponent of an 1D array by the means of R/S analisys: | ||
https://en.wikipedia.org/wiki/Hurst_exponent | ||
""" | ||
|
||
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Packages | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from pyutilz.pythonlib import ( | ||
ensure_installed, | ||
) # lint: disable=ungrouped-imports,disable=wrong-import-order | ||
|
||
ensure_installed("numpy pandas numba scipy sklearn antropy") | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
import numpy as np | ||
from numba import njit | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Inits | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
fastmath = False | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Core funcs | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
|
||
@njit(fastmath=fastmath)
def compute_hurst_rs(arr: np.ndarray, agg_func: object = np.mean):
    """Return the rescaled-range (R/S) statistic of one window of *arr*.

    R is the range of the cumulative deviations from the window's agg_func
    level; S is the window's (population) standard deviation. Returns 0.0
    when the ratio is undefined so the caller can skip this window.
    """
    center = agg_func(arr)

    cum_dev = np.cumsum(arr - center)
    spread = np.max(cum_dev) - np.min(cum_dev)
    scale = np.std(arr)  # , ddof=1

    if spread == 0 or scale == 0:
        # Undefined R/S ratio — signal the caller to ignore this interval.
        return 0.0

    return spread / scale
|
||
|
||
@njit(fastmath=fastmath)
def precompute_hurst_exponent(
    arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = True, agg_func: object = np.mean
):
    """Compute aggregated R/S statistics over a log-spaced grid of window sizes.

    Splits *arr* (optionally its first differences) into non-overlapping chunks of
    increasing length and aggregates the per-chunk R/S statistics with *agg_func*.

    Returns:
        (used_window_sizes, RS) — lists of the window sizes that yielded at least
        one valid (non-zero) R/S value and the corresponding aggregated statistics.
    """

    # Get diffs, if needed

    if take_diffs:
        arr = arr[1:] - arr[:-1]

    L = len(arr)

    # Split parent array several times into a number of equal chunks, increasing the chunk length

    max_window = max_window or (L - 1)
    # Window sizes grow geometrically: 10**(log10(min_window) + k*windows_log_step).
    window_sizes = (10 ** np.arange(np.log10(min_window), np.log10(max_window), windows_log_step)).astype(np.int32)
    # window_sizes.append(L)

    RS = []
    used_window_sizes = []
    for w in window_sizes:
        rs = []
        # Non-overlapping windows; a trailing partial window (start + w >= L) is discarded.
        for start in range(0, L, w):
            if (start + w) >= L:
                break
            # NOTE(review): agg_func is not forwarded here (call commented out) — confirm this is intentional.
            partial_rs = compute_hurst_rs(arr[start : start + w])  # , agg_func=agg_func)
            if partial_rs:  # 0.0 marks an undefined R/S ratio — skip it
                rs.append(partial_rs)
        if rs:
            RS.append(agg_func(np.array(rs)))
            used_window_sizes.append(w)

    return used_window_sizes, RS
|
||
|
||
def compute_hurst_exponent(arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = False) -> tuple:
    """Main entrypoint to compute the Hurst exponent (and the constant) of a numerical array.

    Fits log10(R/S) = h * log10(window) + log10(c) by least squares over the
    window sizes produced by precompute_hurst_exponent.

    Returns:
        (h, c) — the Hurst exponent and the constant, or (nan, nan) when the
        input is too short or too few valid (window, R/S) points exist for a fit.
    """
    if len(arr) < min_window:
        return np.nan, np.nan
    window_sizes, rs = precompute_hurst_exponent(
        arr=arr, min_window=min_window, max_window=max_window, windows_log_step=windows_log_step, take_diffs=take_diffs
    )
    # A slope fit needs at least 2 points; previously an empty/single-point result
    # reached lstsq and produced a crash or a meaningless degenerate solution.
    if len(rs) < 2:
        return np.nan, np.nan
    x = np.vstack([np.log10(window_sizes), np.ones(len(rs))]).T
    h, c = np.linalg.lstsq(x, np.log10(rs), rcond=-1)[0]
    c = 10**c  # undo the log10 on the intercept
    return h, c
|
||
|
||
def hurst_testing():
    """Smoke-test compute_hurst_exponent on random walks of known persistence.

    Prints (h, c) for a brownian (proba=0.5), a persistent (proba=0.7) and an
    antipersistent (proba=0.3) walk of length 1000.
    """

    # pip install hurst

    from hurst import random_walk

    for proba in (0.5, 0.7, 0.3):
        walk = np.array(random_walk(1000, proba=proba))
        print(compute_hurst_exponent(walk))
Oops, something went wrong.