-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
54 changed files
with
6,481 additions
and
0 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
"""Basic feature engineering for ML.""" | ||
|
||
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Packages | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order | ||
|
||
# ensure_installed("numpy pandas") | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
|
||
import warnings | ||
|
||
warnings.simplefilter(action="ignore", category=FutureWarning) | ||
import pandas as pd, numpy as np | ||
|
||
|
||
def create_date_features(
    df: pd.DataFrame,
    cols: list,
    delete_original_cols: bool = True,
    bulk: bool = False,
    methods: Optional[dict] = None,
) -> pd.DataFrame:
    """Expand datetime columns of *df* into integer calendar features.

    For every column in *cols* and every attribute in *methods*, a new column
    ``<col>_<method>`` is created from the column's ``.dt`` accessor and cast to
    the requested dtype.

    Args:
        df: frame containing datetime64 columns (modified in place unless bulk concat is used).
        cols: names of the datetime columns to expand.
        delete_original_cols: drop the source datetime columns after expansion.
        bulk: accumulate new columns in a dict and attach them with a single
            ``pd.concat`` (avoids repeated frame insertions / fragmentation).
        methods: mapping of ``.dt`` attribute name -> output dtype. ``None`` selects
            the default ``{"day": np.int8, "weekday": np.int8, "month": np.int8}``.
            (Kept out of the signature to avoid the shared-mutable-default pitfall;
            other candidates: "week", "quarter": np.int8, "year": np.int16.)

    Returns:
        The enriched DataFrame. Also returned (unchanged) when *cols* is empty,
        so callers can always chain on the result.
    """
    if methods is None:
        methods = {"day": np.int8, "weekday": np.int8, "month": np.int8}

    if not cols:
        return df

    # In bulk mode collect new columns separately; otherwise insert directly into df.
    tmp = {} if bulk else df
    for col in cols:
        for method, dtype in methods.items():
            tmp[col + "_" + method] = getattr(df[col].dt, method).astype(dtype)

    if delete_original_cols:
        df.drop(columns=cols, inplace=True)

    if bulk:
        df = pd.concat([df, pd.DataFrame(tmp)], axis=1)

    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
"""Categorical feature engineering for ML. Optimized & rich set of aggregates for 1d vectors.""" | ||
|
||
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Packages | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from pyutilz.pythonlib import ( | ||
ensure_installed, | ||
) # lint: disable=ungrouped-imports,disable=wrong-import-order | ||
|
||
# ensure_installed("numpy pandas scipy") | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
|
||
import warnings | ||
|
||
warnings.simplefilter(action="ignore", category=FutureWarning) | ||
|
||
from antropy import * | ||
import pandas as pd, numpy as np | ||
from scipy.stats import entropy | ||
from .numerical import compute_numaggs, get_numaggs_names | ||
|
||
import warnings | ||
|
||
warnings.filterwarnings("ignore", message="nperseg =") | ||
|
||
numaggs_names = get_numaggs_names() | ||
directional_numaggs_names = get_numaggs_names(directional_only=True) | ||
|
||
|
||
def compute_countaggs(
    arr: pd.Series,
    counts_normalize: bool = True,  # use relative (normalize=True) or absolute counts
    counts_compute_numaggs: bool = True,  # compute numerical aggregates over the counts themselves
    counts_top_n: int = 1,  # return that many highest/lowest value counts
    counts_return_top_counts: bool = True,  # return top counts
    counts_return_top_values: bool = True,  # return top values
    counts_compute_values_numaggs: bool = False,  # if values are numeric, compute order-dependent numaggs over them (in order of their counts)
    numerical_kwargs: Optional[dict] = None,  # forwarded to compute_numaggs; None -> dict(return_unsorted_stats=False)
):
    """Compute aggregate features from a series' value counts.

    For variables with many repeated values (or categorical ones), value_counts(normalize=...) yields:
    1) Top N highest/lowest values along with their counts (missing slots padded with NaNs)
    2) numaggs over the counts data
    3) if the variable is numeric, directional-only numaggs for the values series sorted by counts
       (order-dependent aggregates only, otherwise it would duplicate numaggs over the regular series)

    NOTE(review): the default of counts_compute_values_numaggs is False here but True in
    get_countaggs_names — with pure defaults the feature names and values misalign; pass the
    same flags to both functions (confirm which default is intended).
    """
    # None-default avoids the shared-mutable-default-argument pitfall.
    if numerical_kwargs is None:
        numerical_kwargs = dict(return_unsorted_stats=False)

    value_counts = arr.value_counts(normalize=counts_normalize)
    value_counts = value_counts[value_counts > 0]  # presumably drops 0-count entries (e.g. unused categorical levels) — confirm
    values = value_counts.index.values
    counts = value_counts.values

    res = []

    if counts_compute_numaggs:
        res.extend(compute_numaggs(arr=counts, **numerical_kwargs))

    if counts_top_n:
        # Pad with NaNs when there are fewer distinct values than requested.
        if len(counts) >= counts_top_n:
            extra = []
        else:
            extra = [np.nan] * (counts_top_n - len(counts))

        if counts_return_top_counts:
            res.extend(counts[:counts_top_n].tolist() + extra)
            res.extend(extra + counts[-counts_top_n:].tolist())
        if counts_return_top_values:
            res.extend(values[:counts_top_n].tolist() + extra)
            res.extend(extra + values[-counts_top_n:].tolist())

    if counts_compute_values_numaggs:
        if pd.api.types.is_numeric_dtype(values):
            # Copy before mutation so the caller's kwargs dict is never modified.
            processed_numerical_kwargs = numerical_kwargs.copy()
            processed_numerical_kwargs["directional_only"] = True
            res.extend(compute_numaggs(arr=values, **processed_numerical_kwargs))
        else:
            # Non-numeric values: emit NaN placeholders so the feature vector length stays fixed.
            res.extend([np.nan] * len(directional_numaggs_names))

    return res
|
||
|
||
def get_countaggs_names(
    counts_normalize: bool = True,  # use relative (normalize=True) or absolute counts
    counts_compute_numaggs: bool = True,  # compute numerical aggregates over counts data or not
    counts_top_n: int = 1,  # return that many highest/lowest value counts
    counts_return_top_counts: bool = True,  # return top counts
    counts_return_top_values: bool = True,  # return top values
    counts_compute_values_numaggs: bool = True,  # NOTE(review): default differs from compute_countaggs (False there) — pass flags explicitly
    numerical_kwargs: Optional[dict] = None,  # forwarded to get_numaggs_names; None -> dict(return_unsorted_stats=False)
) -> list:
    """Return the feature names matching compute_countaggs output.

    Must be called with the same flags as compute_countaggs for the names to
    align with the computed values.
    """
    # None-default avoids the shared-mutable-default-argument pitfall.
    if numerical_kwargs is None:
        numerical_kwargs = dict(return_unsorted_stats=False)

    res = []

    if counts_compute_numaggs:
        # Suffix marks whether counts were normalized (cntnrm) or absolute (cnt).
        res.extend([feat + "_" + ("cntnrm" if counts_normalize else "cnt") for feat in get_numaggs_names(**numerical_kwargs)])

    if counts_top_n:
        if counts_return_top_counts:
            res.extend(["top_" + str(i + 1) + "_vcnt" for i in range(counts_top_n)])
            res.extend(["btm_" + str(counts_top_n - i) + "_vcnt" for i in range(counts_top_n)])
        if counts_return_top_values:
            res.extend(["top_" + str(i + 1) + "_vval" for i in range(counts_top_n)])
            res.extend(["btm_" + str(counts_top_n - i) + "_vval" for i in range(counts_top_n)])

    if counts_compute_values_numaggs:
        # directional_numaggs_names is precomputed at module level with directional_only=True,
        # so numerical_kwargs does not need to be processed here (dead copy removed).
        res.extend([feat + "_vvls" for feat in directional_numaggs_names])

    return res
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
"""Compute the Hurst Exponent of an 1D array by the means of R/S analisys: | ||
https://en.wikipedia.org/wiki/Hurst_exponent | ||
""" | ||
|
||
# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# LOGGING | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Packages | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from pyutilz.pythonlib import ( | ||
ensure_installed, | ||
) # lint: disable=ungrouped-imports,disable=wrong-import-order | ||
|
||
ensure_installed("numpy pandas numba scipy sklearn antropy") | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Normal Imports | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
from typing import * | ||
import numpy as np | ||
from numba import njit | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Inits | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
fastmath = False | ||
|
||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
# Core funcs | ||
# ---------------------------------------------------------------------------------------------------------------------------- | ||
|
||
|
||
@njit(fastmath=fastmath)
def compute_hurst_rs(arr: np.ndarray, agg_func: object = np.mean):
    """Return the rescaled-range (R/S) statistic of one window of *arr*.

    R is the range of the cumulative deviations from the window's agg_func
    level; S is the window's (population) standard deviation. Returns 0.0
    when the ratio is undefined so the caller can skip this window.
    """
    center = agg_func(arr)

    cum_dev = np.cumsum(arr - center)
    spread = np.max(cum_dev) - np.min(cum_dev)
    scale = np.std(arr)  # , ddof=1

    if spread == 0 or scale == 0:
        # Undefined R/S ratio — signal the caller to ignore this interval.
        return 0.0

    return spread / scale
|
||
|
||
@njit(fastmath=fastmath)
def precompute_hurst_exponent(
    arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = True, agg_func: object = np.mean
):
    """Compute aggregated R/S statistics over a log-spaced grid of window sizes.

    Splits *arr* (optionally its first differences) into non-overlapping chunks of
    increasing length and aggregates the per-chunk R/S statistics with *agg_func*.

    Returns:
        (used_window_sizes, RS) — lists of the window sizes that yielded at least
        one valid (non-zero) R/S value and the corresponding aggregated statistics.
    """

    # Get diffs, if needed

    if take_diffs:
        arr = arr[1:] - arr[:-1]

    L = len(arr)

    # Split parent array several times into a number of equal chunks, increasing the chunk length

    max_window = max_window or (L - 1)
    # Window sizes grow geometrically: 10**(log10(min_window) + k*windows_log_step).
    window_sizes = (10 ** np.arange(np.log10(min_window), np.log10(max_window), windows_log_step)).astype(np.int32)
    # window_sizes.append(L)

    RS = []
    used_window_sizes = []
    for w in window_sizes:
        rs = []
        # Non-overlapping windows; a trailing partial window (start + w >= L) is discarded.
        for start in range(0, L, w):
            if (start + w) >= L:
                break
            # NOTE(review): agg_func is not forwarded here (call commented out) — confirm this is intentional.
            partial_rs = compute_hurst_rs(arr[start : start + w])  # , agg_func=agg_func)
            if partial_rs:  # 0.0 marks an undefined R/S ratio — skip it
                rs.append(partial_rs)
        if rs:
            RS.append(agg_func(np.array(rs)))
            used_window_sizes.append(w)

    return used_window_sizes, RS
|
||
|
||
def compute_hurst_exponent(arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = False) -> tuple:
    """Main entrypoint to compute the Hurst exponent (and the constant) of a numerical array.

    Fits log10(R/S) = h * log10(window) + log10(c) by least squares over the
    window sizes produced by precompute_hurst_exponent.

    Returns:
        (h, c) — the Hurst exponent and the constant, or (nan, nan) when the
        input is too short or too few valid (window, R/S) points exist for a fit.
    """
    if len(arr) < min_window:
        return np.nan, np.nan
    window_sizes, rs = precompute_hurst_exponent(
        arr=arr, min_window=min_window, max_window=max_window, windows_log_step=windows_log_step, take_diffs=take_diffs
    )
    # A slope fit needs at least 2 points; previously an empty/single-point result
    # reached lstsq and produced a crash or a meaningless degenerate solution.
    if len(rs) < 2:
        return np.nan, np.nan
    x = np.vstack([np.log10(window_sizes), np.ones(len(rs))]).T
    h, c = np.linalg.lstsq(x, np.log10(rs), rcond=-1)[0]
    c = 10**c  # undo the log10 on the intercept
    return h, c
|
||
|
||
def hurst_testing():
    """Smoke-test compute_hurst_exponent on random walks of known persistence.

    Prints (h, c) for a brownian (proba=0.5), a persistent (proba=0.7) and an
    antipersistent (proba=0.3) walk of length 1000.
    """

    # pip install hurst

    from hurst import random_walk

    for proba in (0.5, 0.7, 0.3):
        walk = np.array(random_walk(1000, proba=proba))
        print(compute_hurst_exponent(walk))
Oops, something went wrong.