Skip to content

Commit

Permalink
No commit message
Browse files Browse the repository at this point in the history
  • Loading branch information
fingoldo committed May 12, 2024
1 parent c72dd9a commit eb80b91
Show file tree
Hide file tree
Showing 54 changed files with 6,481 additions and 0 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
57 changes: 57 additions & 0 deletions mlframe/feature_engineering/basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Basic feature engineering for ML."""

# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement

# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------

import logging

logger = logging.getLogger(__name__)

# ----------------------------------------------------------------------------------------------------------------------------
# Packages
# ----------------------------------------------------------------------------------------------------------------------------

from pyutilz.pythonlib import ensure_installed # lint: disable=ungrouped-imports,disable=wrong-import-order

# ensure_installed("numpy pandas")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------

from typing import *

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd, numpy as np


def create_date_features(
    df: pd.DataFrame,
    cols: list,
    delete_original_cols: bool = True,
    bulk: bool = False,
    methods: Optional[dict] = None,  # e.g. add "week": np.int8, "quarter": np.int8, "year": np.int16
) -> pd.DataFrame:
    """Expand datetime columns of `df` into integer calendar features.

    For every column in `cols`, one new column per entry of `methods` is added
    (named "<col>_<method>"), computed via the pandas `.dt` accessor and cast
    to the requested dtype.

    Args:
        df: source dataframe; new columns are written into it directly unless `bulk` is set.
        cols: datetime-typed columns to expand; if empty, `df` is returned unchanged.
        delete_original_cols: drop the original datetime columns afterwards.
        bulk: accumulate new features in a dict and concat once at the end
            (avoids repeated insertions when creating many columns).
        methods: mapping of `.dt` attribute name -> output dtype; defaults to
            {"day": np.int8, "weekday": np.int8, "month": np.int8}.

    Returns:
        The enriched dataframe (the same object as `df` unless `bulk=True`).
    """
    if methods is None:
        # built per call: a shared mutable default dict could leak mutations between calls
        methods = {"day": np.int8, "weekday": np.int8, "month": np.int8}

    if not cols:
        return df  # nothing to do, but still return the dataframe so callers can chain

    # In bulk mode, collect new columns separately and concat once at the end.
    tmp = {} if bulk else df
    for col in cols:
        for method, dtype in methods.items():
            tmp[col + "_" + method] = getattr(df[col].dt, method).astype(dtype)

    if delete_original_cols:
        df.drop(columns=cols, inplace=True)

    if bulk:
        df = pd.concat([df, pd.DataFrame(tmp)], axis=1)

    return df
126 changes: 126 additions & 0 deletions mlframe/feature_engineering/categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Categorical feature engineering for ML. Optimized & rich set of aggregates for 1d vectors."""

# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement

# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------

import logging

logger = logging.getLogger(__name__)

# ----------------------------------------------------------------------------------------------------------------------------
# Packages
# ----------------------------------------------------------------------------------------------------------------------------

from pyutilz.pythonlib import (
ensure_installed,
) # lint: disable=ungrouped-imports,disable=wrong-import-order

# ensure_installed("numpy pandas scipy")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------

from typing import *

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

from antropy import *
import pandas as pd, numpy as np
from scipy.stats import entropy
from .numerical import compute_numaggs, get_numaggs_names

import warnings

warnings.filterwarnings("ignore", message="nperseg =")

numaggs_names = get_numaggs_names()
directional_numaggs_names = get_numaggs_names(directional_only=True)


def compute_countaggs(
    arr: pd.Series,
    counts_normalize: bool = True,  # use relative or absolute counts
    counts_compute_numaggs: bool = True,  # compute numerical aggregates over counts data or not
    counts_top_n: int = 1,  # return that many highest/lowest value counts
    counts_return_top_counts: bool = True,  # return top counts
    counts_return_top_values: bool = True,  # return top values
    counts_compute_values_numaggs: bool = False,  # if all values are in fact numerical, compute numaggs for them rather than their counts (ordered only, in order of their counts)
    numerical_kwargs: Optional[dict] = None,
):
    """Compute count-based aggregates of a series via value_counts().

    Returns a flat list of features:
    1) numaggs over the counts themselves (when `counts_compute_numaggs`)
    2) the `counts_top_n` highest/lowest counts and/or their values, padded
       with NaNs when there are fewer than `counts_top_n` distinct values
    3) directional numaggs over the values ordered by their counts, when
       `counts_compute_values_numaggs` and the values are numeric
       (NaN placeholders otherwise, to keep the feature vector length stable)

    Call get_countaggs_names() with the same flags to obtain matching names.
    """
    if numerical_kwargs is None:
        # built per call: a shared mutable default dict could leak mutations between calls
        numerical_kwargs = dict(return_unsorted_stats=False)

    value_counts = arr.value_counts(normalize=counts_normalize)
    value_counts = value_counts[value_counts > 0]  # drop zero counts (e.g. unused categories of a categorical dtype)
    values = value_counts.index.values  # distinct values, ordered by descending count
    counts = value_counts.values

    res = []

    if counts_compute_numaggs:
        res.extend(compute_numaggs(arr=counts, **numerical_kwargs))

    if counts_top_n:

        # NaN padding keeps the output length constant regardless of the number of distinct values
        if len(counts) >= counts_top_n:
            extra = []
        else:
            extra = [np.nan] * (counts_top_n - len(counts))

        if counts_return_top_counts:
            res.extend(counts[:counts_top_n].tolist() + extra)
            res.extend(extra + counts[-counts_top_n:].tolist())
        if counts_return_top_values:
            res.extend(values[:counts_top_n].tolist() + extra)
            res.extend(extra + values[-counts_top_n:].tolist())

    if counts_compute_values_numaggs:
        if pd.api.types.is_numeric_dtype(values):
            processed_numerical_kwargs = numerical_kwargs.copy()
            processed_numerical_kwargs["directional_only"] = True
            res.extend(compute_numaggs(arr=values, **processed_numerical_kwargs))
        else:
            # non-numeric values: emit placeholders so lengths stay aligned with the names
            res.extend([np.nan] * len(directional_numaggs_names))

    return res


def get_countaggs_names(
    counts_normalize: bool = True,  # use relative or absolute counts
    counts_compute_numaggs: bool = True,  # compute numerical aggregates over counts data or not
    counts_top_n: int = 1,  # return that many highest/lowest value counts
    counts_return_top_counts: bool = True,  # return top counts
    counts_return_top_values: bool = True,  # return top values
    counts_compute_values_numaggs: bool = False,  # default aligned with compute_countaggs (was True, which desynced names from features)
    numerical_kwargs: Optional[dict] = None,
) -> list:
    """Return the feature names matching the output of compute_countaggs().

    Must be called with the same flags as compute_countaggs so that the name
    list lines up 1:1 with the produced feature list.
    """
    if numerical_kwargs is None:
        # built per call: a shared mutable default dict could leak mutations between calls
        numerical_kwargs = dict(return_unsorted_stats=False)

    res = []

    if counts_compute_numaggs:
        suffix = "cntnrm" if counts_normalize else "cnt"
        res.extend([feat + "_" + suffix for feat in get_numaggs_names(**numerical_kwargs)])

    if counts_top_n:
        if counts_return_top_counts:
            res.extend(["top_" + str(i + 1) + "_vcnt" for i in range(counts_top_n)])
            res.extend(["btm_" + str(counts_top_n - i) + "_vcnt" for i in range(counts_top_n)])
        if counts_return_top_values:
            res.extend(["top_" + str(i + 1) + "_vval" for i in range(counts_top_n)])
            res.extend(["btm_" + str(counts_top_n - i) + "_vval" for i in range(counts_top_n)])

    if counts_compute_values_numaggs:
        # compute_countaggs emits directional numaggs (or NaN placeholders) in this slot
        res.extend([feat + "_vvls" for feat in directional_numaggs_names])

    return res
124 changes: 124 additions & 0 deletions mlframe/feature_engineering/hurst.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""Compute the Hurst Exponent of an 1D array by the means of R/S analisys:
https://en.wikipedia.org/wiki/Hurst_exponent
"""

# pylint: disable=wrong-import-order,wrong-import-position,unidiomatic-typecheck,pointless-string-statement

# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------

import logging

logger = logging.getLogger(__name__)

# ----------------------------------------------------------------------------------------------------------------------------
# Packages
# ----------------------------------------------------------------------------------------------------------------------------

from pyutilz.pythonlib import (
ensure_installed,
) # lint: disable=ungrouped-imports,disable=wrong-import-order

ensure_installed("numpy pandas numba scipy sklearn antropy")

# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------

from typing import *
import numpy as np
from numba import njit

# ----------------------------------------------------------------------------------------------------------------------------
# Inits
# ----------------------------------------------------------------------------------------------------------------------------

fastmath = False

# ----------------------------------------------------------------------------------------------------------------------------
# Core funcs
# ----------------------------------------------------------------------------------------------------------------------------


@njit(fastmath=fastmath)
def compute_hurst_rs(arr: np.ndarray, agg_func: object = np.mean):
    """Compute the rescaled-range (R/S) statistic of one window of values.

    R is the range of the cumulative deviations from the window's central
    value (`agg_func`, np.mean by default); S is the population standard
    deviation of the window.

    Returns:
        R / S, or 0.0 when either R or S is zero so the caller can skip the
        window (the ratio is undefined there).
    """

    mean = agg_func(arr)

    deviations = arr - mean
    Z = np.cumsum(deviations)
    R = np.max(Z) - np.min(Z)
    S = np.std(arr)  # population std; pass ddof=1 here for the sample std instead

    if R == 0 or S == 0:
        return 0.0  # sentinel: skip this interval, the R/S ratio is undefined

    return R / S


@njit(fastmath=fastmath)
def precompute_hurst_exponent(
    arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = True, agg_func: object = np.mean
):
    """Compute aggregated R/S statistics over a log-spaced grid of window sizes.

    For each window size w (log10-spaced between `min_window` and
    `max_window`), the series is split into consecutive chunks of length w;
    the R/S statistic of each chunk is computed and aggregated with
    `agg_func`.

    Returns:
        (used_window_sizes, RS): parallel lists — window sizes that produced
        at least one valid (non-zero) R/S value, and their aggregated R/S.
    """

    # Work on first differences when the input is a level series (e.g. prices)

    if take_diffs:
        arr = arr[1:] - arr[:-1]

    L = len(arr)

    # Split parent array several times into a number of equal chunks, increasing the chunk length

    max_window = max_window or (L - 1)
    window_sizes = (10 ** np.arange(np.log10(min_window), np.log10(max_window), windows_log_step)).astype(np.int32)
    # window_sizes.append(L)

    RS = []
    used_window_sizes = []
    for w in window_sizes:
        rs = []
        for start in range(0, L, w):
            if (start + w) >= L:
                break  # drop the trailing incomplete chunk
            # NOTE(review): agg_func is not forwarded here (commented out), so chunk-level
            # R/S always uses the default np.mean — confirm this is intended.
            partial_rs = compute_hurst_rs(arr[start : start + w])  # , agg_func=agg_func)
            if partial_rs:
                rs.append(partial_rs)
        if rs:
            RS.append(agg_func(np.array(rs)))
            used_window_sizes.append(w)

    return used_window_sizes, RS


def compute_hurst_exponent(arr: np.ndarray, min_window: int = 5, max_window: int = None, windows_log_step: float = 0.25, take_diffs: bool = False) -> tuple:
    """Main entrypoint to compute the Hurst exponent (and the constant) of a numerical array.

    Fits log10(R/S) = h * log10(window) + log10(c) by least squares over the
    window sizes produced by precompute_hurst_exponent.

    Returns:
        (h, c): the Hurst exponent and the fitted constant, or (nan, nan)
        when the series is too short or no valid R/S points were obtained.
    """
    if arr is None or len(arr) < min_window:
        return np.nan, np.nan
    window_sizes, rs = precompute_hurst_exponent(
        arr=arr, min_window=min_window, max_window=max_window, windows_log_step=windows_log_step, take_diffs=take_diffs
    )
    if not rs:
        # every window was degenerate (e.g. constant series): the fit is impossible
        return np.nan, np.nan
    x = np.vstack([np.log10(window_sizes), np.ones(len(rs))]).T
    h, c = np.linalg.lstsq(x, np.log10(rs), rcond=-1)[0]
    c = 10**c
    return h, c


def hurst_testing():
    """Smoke-test compute_hurst_exponent on synthetic random walks.

    Requires the third-party `hurst` package (pip install hurst).
    Prints (h, c) for a brownian (proba=0.5), a persistent (0.7) and an
    antipersistent (0.3) walk, in that order.
    """
    from hurst import random_walk

    for proba in (0.5, 0.7, 0.3):  # brownian, persistent, antipersistent
        series = random_walk(1000, proba=proba)
        print(compute_hurst_exponent(np.array(series)))
Loading

0 comments on commit eb80b91

Please sign in to comment.