Skip to content

Commit

Permalink
HISEL v0.4.0 - Better user interface (#35)
Browse files Browse the repository at this point in the history
* Comprehensive API for selection - to be used in tw-experimentation

* fix hsic search

* categorical tests

* HISEL - v0.4.0 better user interface
  • Loading branch information
claudio-tw authored Jul 10, 2023
1 parent 556dda2 commit 22785d9
Show file tree
Hide file tree
Showing 13 changed files with 360 additions and 257 deletions.
Binary file removed dist/hisel-0.3.0-py3-none-any.whl
Binary file not shown.
Binary file added dist/hisel-0.4.0-py3-none-any.whl
Binary file not shown.
40 changes: 36 additions & 4 deletions hisel/categorical.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from typing import Optional, Set, Tuple, Callable, Union, List
import itertools
import threading
import sys
import time
import numpy as np
import pandas as pd
from dataclasses import dataclass
from sklearn.metrics import adjusted_mutual_info_score
from joblib import Parallel, delayed
from tqdm import tqdm


from hisel import permutohedron
Expand Down Expand Up @@ -67,10 +72,12 @@ def select(
parallel: bool = False,
random_state: Optional[int] = None,
) -> Selection:
print(f'Number of categorical features: {xdf.shape[1]}')
xdf = _preprocess_datatypes(xdf)
x = xdf.values
ydf = _preprocess_datatypes(ydf)
allfeatures: List[np.ndarray] = []

if isinstance(ydf, pd.Series):
if ydf.dtypes == float:
y = _discretise(ydf.values)
Expand Down Expand Up @@ -105,6 +112,7 @@ def select(
fs = np.concatenate(allfeatures)
indexes = np.array(list(set(fs)), dtype=int)
features = list(xdf.columns[indexes])
print(f'Number of selected categorical features: {len(features)}')
return Selection(indexes=indexes, features=features)


Expand All @@ -124,15 +132,15 @@ def search(
assert x.dtype == int
assert y.dtype == int
if num_permutations is None:
num_permutations = 3 * d
num_permutations = d
x = x - np.amin(x, axis=0, keepdims=True)
y = y - np.amin(y, axis=0, keepdims=True)
active_set = set(range(d))
sel = np.arange(d, dtype=int)
features = np.array([], dtype=int)
imall = .0
n_iter = 0
while len(active_set) > 1 and n_iter < max_iter:
while len(active_set) > 0 and n_iter < max_iter:
active = np.array(list(active_set))
num_active = len(active)
num_haar_samples = min(
Expand All @@ -148,11 +156,11 @@ def search(
tries = Parallel(n_jobs=-1)([
delayed(_try_permutation)(
ami, x, y, active, list(permutation))
for permutation in permutations
for permutation in tqdm(permutations)
])
else:
tries = [_try_permutation(
ami, x, y, active, list(permutation)) for permutation in permutations]
ami, x, y, active, list(permutation)) for permutation in tqdm(permutations)]

im = .0
for im_, sel_ in tries:
Expand All @@ -169,6 +177,16 @@ def search(
features = np.concatenate((features, sel))
active_set = active_set.difference(set(features))
n_iter += 1
threshold = im_ratio * imall
fwsel = _featurewise_selection(
ami,
x,
y,
threshold
)
features = np.array(list(
set(features).union(set(fwsel))
))
return features


Expand Down Expand Up @@ -209,3 +227,17 @@ def _try_permutation(
im = ims[s]
selection = sel[:s+1]
return im, selection


def _featurewise_selection(
metric: Callable[[np.ndarray, np.ndarray], np.ndarray],
x: np.ndarray,
y: np.ndarray,
threshold: float,
) -> List[int]:
sel = []
for i in range(x.shape[1]):
v = metric(x[:, [i]], y)
if v > threshold:
sel.append(i)
return sel
3 changes: 3 additions & 0 deletions hisel/feature_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,12 @@ def select_features(
hsiclasso_parameters = HSICLassoParameters()
if categorical_search_parameters is None:
categorical_search_parameters = SearchParameters()

print("\n***Selection of continuous features***")
continuous_lasso_selection: LassoSelection = select.select(
xdf[continuous_features], ydf, **hsiclasso_parameters)

print("\n***Selection of categorical features***")
categorical_search_selection: categorical.Selection = categorical.select(
xdf[discrete_features], ydf, **categorical_search_parameters)

Expand Down
5 changes: 3 additions & 2 deletions hisel/kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from joblib import Parallel, delayed
from enum import Enum
import numpy as np
from tqdm import tqdm


class KernelType(Enum):
Expand Down Expand Up @@ -233,10 +234,10 @@ def apply_feature_map(
l,
h,
is_multivariate
) for batch in batches]
) for batch in tqdm(batches)]
else:
partial_phis = Parallel(n_jobs=-1)([
delayed(_run_batch)(kernel_type, batch, l) for batch in batches
delayed(_run_batch)(kernel_type, batch, l) for batch in tqdm(batches)
])
phi: np.ndarray = np.vstack(partial_phis)
return phi
Expand Down
120 changes: 75 additions & 45 deletions hisel/select.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# API
from typing import List, Optional, Union
from typing import List, Optional, Union, Tuple
from enum import Enum
from dataclasses import dataclass
import numpy as np
Expand Down Expand Up @@ -42,22 +42,33 @@ def ksgmi(
x: pd.DataFrame,
y: Union[pd.DataFrame, pd.Series],
threshold: float = .01,
):
) -> Tuple[List[str], pd.Series]:
x = _preprocess_datatypes(x)
y = _preprocess_datatypes(y)
discrete_features = x.dtypes == int
mix = x.values
if isinstance(y, pd.Series) or (isinstance(y, pd.DataFrame) and y.shape[1] == 1):
miy = np.squeeze(y.values)
miys = np.squeeze(y.values).reshape(-1, 1)
else:
miy = np.linalg.norm(y, axis=1)
compute_mi = mutual_info_classif if miy.dtype == int else mutual_info_regression
mis = compute_mi(mix, miy, discrete_features=discrete_features)
mis /= np.max(mis)
isrelevant = mis > threshold
relevant_features = np.arange(x.shape[1])[isrelevant]
print(f'ksg-mi preprocessing: {sum(isrelevant)} features are pre-selected')
return relevant_features, mis
miys = y.values
sel = set()
totmis = np.zeros(x.shape[1], dtype=float)
for j in range(miys.shape[1]):
miy = miys[:, j]
compute_mi = mutual_info_classif if miy.dtype == int else mutual_info_regression
mis = pd.Series(
compute_mi(mix, miy, discrete_features=discrete_features),
index=x.columns)
mis /= np.max(mis)
sel = sel.union(set(
set(mis.loc[mis > threshold].index)
))
totmis += mis
mutual_infos = pd.Series(totmis, index=x.columns)
relevant_features = list(sel)
print(
f'ksg-mi preprocessing: {len(relevant_features)} features are pre-selected')
return relevant_features, mutual_infos


class HSICSelector:
Expand All @@ -66,6 +77,7 @@ def __init__(self,
y: np.ndarray,
xfeattype: Optional[FeatureType] = None,
yfeattype: Optional[FeatureType] = None,
feature_names: Optional[List[str]] = None,
):
assert x.ndim == 2
assert y.ndim == 2
Expand Down Expand Up @@ -94,6 +106,11 @@ def __init__(self,
self.yfeattype = yfeattype
self.xkerneltype = KernelType.DELTA if xfeattype == FeatureType.DISCR else KernelType.RBF
self.ykerneltype = KernelType.DELTA if yfeattype == FeatureType.DISCR else KernelType.RBF
if feature_names is None:
self.feature_names = [f'f{f}' for f in range(x.shape[1])]
pass
else:
self.feature_names = feature_names

def lasso_path(self):
if not hasattr(self, 'lassopaths'):
Expand All @@ -109,7 +126,7 @@ def lasso_path(self):
paths.append(_p)
path = np.mean(np.vstack(paths), axis=0)
df: pd.DataFrame = pd.DataFrame(
path, columns=[f'f{f}' for f in range(path.shape[1])])
path, columns=self.feature_names)
return df

def projection_matrix(self,
Expand Down Expand Up @@ -208,31 +225,50 @@ def autoselect(self,
number_of_epochs: int = 1,
threshold: float = .01,
device: Optional[str] = None,
):
curve = self.regularization_curve(
batch_size=batch_size,
minibatch_size=minibatch_size,
number_of_epochs=number_of_epochs,
device=device,
lasso_path: Optional[pd.DataFrame] = None,
) -> List[str]:
if lasso_path is None:
curve = self.regularization_curve(
batch_size=batch_size,
minibatch_size=minibatch_size,
number_of_epochs=number_of_epochs,
device=device,
)
lasso_path = self.lasso_path()
return HSICSelector.select_from_lasso_path(lasso_path, threshold)

@staticmethod
def select_from_lasso_path(
        lasso_path: pd.DataFrame,
        threshold: float = .01,
) -> List[str]:
    """Pick the leading features from the last row of a lasso path.

    The last row of `lasso_path` is read as one coefficient per column.
    Coefficients are sorted in decreasing order and divided by the
    largest one; a feature is counted when its normalised coefficient is
    strictly greater than `threshold`.

    Parameters
    ----------
    lasso_path:
        Data frame with one column per feature. Only its last row is used.
    threshold:
        Strict lower bound on the normalised coefficient.

    Returns
    -------
    List[str]
        Column names ordered by decreasing final coefficient, truncated
        to the number of coefficients above `threshold`. Empty when the
        frame has no rows or columns, or when the largest coefficient is
        zero.
    """
    features = list(lasso_path.columns)
    if not features or lasso_path.shape[0] == 0:
        # Nothing to rank: avoid indexing an empty frame.
        return []
    curve = np.cumsum(np.sort(lasso_path.iloc[-1, :])[::-1])
    ordered_features = sorted(
        features,
        key=lambda a: lasso_path[a].values[-1],
        reverse=True
    )
    # Differencing the cumulative curve recovers the sorted coefficients.
    betas = np.diff(curve, prepend=.0)
    if betas[0] == 0:
        # Normalising by a zero leading coefficient is 0/0; no feature
        # can exceed the threshold in that case.
        return []
    # Not in-place: `/=` would fail on an integer-valued path.
    betas = betas / betas[0]
    number_of_features = int(np.count_nonzero(betas > threshold))
    # A static method has no `self`; return the locally computed ordering.
    return ordered_features[:number_of_features]


@dataclass
class Selection:
    """Result record built by `select`, with features identified by name.

    `features` is optional and defaults to ``None``.
    """
    # Names of the features that entered the HSIC stage.
    preselection: List[str]
    # Mutual-information scores, indexed by feature name.
    mis: pd.Series
    # Names of the features chosen by the HSIC selector.
    hsic_selection: List[str]
    # Feature names sorted by decreasing mutual information.
    mi_ordered_features: List[str]
    # Feature names in the order produced by the HSIC selector.
    hsic_ordered_features: List[str]
    # Lasso paths, one column per feature name.
    lassopaths: pd.DataFrame
    # presumably the cumulative sum of the sorted final lasso
    # coefficients -- TODO confirm against the caller
    regcurve: np.ndarray
    # Final list of selected feature names, when available.
    features: Optional[List[str]] = None

    def select_from_lasso_path(self, threshold: float = 0.01) -> List[str]:
        """Re-run the selection on the stored lasso paths.

        Delegates to `HSICSelector.select_from_lasso_path` with
        `self.lassopaths`, so a different `threshold` can be tried
        without recomputing the paths.
        """
        return HSICSelector.select_from_lasso_path(self.lassopaths, threshold)


def select(
x: pd.DataFrame,
Expand All @@ -249,39 +285,33 @@ def select(
if use_preselection:
cols, mis = ksgmi(x, y, mi_threshold)
else:
cols = np.arange(d)
mis = np.zeros(d)
x_ = x.iloc[:, cols].values
cols = x.columns.tolist()
mis = pd.Series(np.zeros(d), index=cols)
x_ = x.loc[:, cols].values
y_ = y.values
if y_.ndim == 1:
y_ = y_.reshape(-1, 1)
selector = HSICSelector(x_, y_)
selector = HSICSelector(x_, y_, feature_names=cols)
innersel_ = selector.autoselect(
threshold=hsic_threshold,
batch_size=batch_size,
minibatch_size=minibatch_size,
number_of_epochs=number_of_epochs,
device=device
)
_innersel = np.array(innersel_)
print(f'HSIC has selected {len(innersel_)} features')
hsic_ordered_features = cols[selector.ordered_features]
mi_ordered_features = np.argsort(mis)[::-1]
hsic_selection = cols[_innersel]
paths = selector.lasso_path()
renamecols = {
fd: f"f{cols[int(fd.split('f')[1])]}"
for fd in paths.columns
}
paths.rename(
columns=renamecols,
inplace=True
hsic_ordered_features = list(
np.array(cols)[selector.ordered_features]
)
curve = np.cumsum(np.sort(paths.iloc[-1, :])[::-1])
features = list(x.columns[hsic_selection])
preselection: List[str] = cols
mi_ordered_features: List[str] = list(
mis.sort_values(ascending=False).index)
hsic_selection: List[str] = innersel_
paths: pd.DataFrame = selector.lasso_path()
curve: np.array = np.cumsum(np.sort(paths.iloc[-1, :])[::-1])
features: List[str] = hsic_selection
sel = Selection(
preselection=cols,
_innersel=_innersel,
preselection=preselection,
mis=mis,
hsic_selection=hsic_selection,
mi_ordered_features=mi_ordered_features,
Expand Down
Loading

0 comments on commit 22785d9

Please sign in to comment.