Merge pull request #20 from Tialo/main
Fixed handling of categorical features.
ThomasBury authored May 15, 2023
2 parents ca71fb2 + 083b196 commit fbd355b
Showing 4 changed files with 49 additions and 72 deletions.
54 changes: 13 additions & 41 deletions src/arfs/feature_selection/allrelevant.py
@@ -337,18 +337,15 @@ def _fit(self, X_raw, y, sample_weight=None):
         # because the columns are dynamically created/rejected
         X = X_raw

-        X = np.nan_to_num(X)
-        y = np.nan_to_num(y)
+        X = X.apply(np.nan_to_num)
+        if not isinstance(y, pd.Series):
+            y = pd.Series(np.nan_to_num(y))
+        else:
+            y = y.apply(np.nan_to_num)

         # check input params
         self._check_params(X, y)

-        if not isinstance(X, np.ndarray):
-            X = self._validate_pandas_input(X)
-
-        if not isinstance(y, np.ndarray):
-            y = self._validate_pandas_input(y)
-
         if sample_weight is not None:
             if not isinstance(sample_weight, np.ndarray):
                 sample_weight = self._validate_pandas_input(sample_weight)
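Note on the first change: np.nan_to_num called on a whole DataFrame returns a bare ndarray, silently dropping column names and per-column dtypes, which would defeat the categorical handling this PR relies on downstream. Applying it column-wise keeps the frame intact. A minimal sketch of the difference (illustrative only; the frame and values are made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})

    as_array = np.nan_to_num(df)        # -> ndarray, column labels are gone
    as_frame = df.apply(np.nan_to_num)  # -> DataFrame, labels preserved

    print(type(as_array).__name__)      # ndarray
    print(list(as_frame.columns))       # ['a', 'b']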
@@ -441,9 +438,6 @@ def _fit(self, X_raw, y, sample_weight=None):
         self.imp_real_hist = imp_history
         self.sha_max = imp_sha_max

-        if isinstance(X_raw, np.ndarray):
-            X_raw = pd.DataFrame(X_raw)
-
         # absolute ranking
         vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.feature_names_in_)
         self.ranking_absolutes_ = list(
@@ -635,21 +629,6 @@ def _get_tree_num(self, n_feat):
         n_estimators = int(multi * f_repr)
         return n_estimators

-    def _get_shuffle(self, seq):
-        """private method, shuffle a sequence
-
-        Parameters
-        ----------
-        seq : np.array
-            the sequence to shuffle
-
-        Returns
-        -------
-        seq : np.array
-            the shuffled sequence
-        """
-        self.random_state.shuffle(seq)
-        return seq
-
     def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
         """Add a shuffled copy of the columns (shadows) and get the feature
         importance of the augmented data set
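The deleted _get_shuffle helper existed only because RandomState.shuffle permutes in place and returns None, so the old numpy path needed a wrapper that handed the sequence back. RandomState.permutation returns a shuffled copy, which can be passed straight to DataFrame.apply in the hunk below. A two-line contrast (illustrative):

    import numpy as np

    rng = np.random.RandomState(0)
    seq = np.array([1, 2, 3, 4])

    rng.shuffle(seq)             # shuffles in place, returns None -> needed a wrapper
    out = rng.permutation(seq)   # returns a shuffled copy, usable directly
    print(out)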
@@ -673,26 +652,27 @@ def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
         """
         # find features that are tentative still
         x_cur_ind = np.where(dec_reg >= 0)[0]
-        x_cur = np.copy(X[:, x_cur_ind])
+        x_cur = X.iloc[:, x_cur_ind].copy()
         x_cur_w = x_cur.shape[1]
         # deep copy the matrix for the shadow matrix
-        x_sha = np.copy(x_cur)
+        x_sha = x_cur.copy()
         # make sure there's at least 5 columns in the shadow matrix
         while x_sha.shape[1] < 5:
-            x_sha = np.hstack((x_sha, x_sha))
+            x_sha = pd.concat([x_sha, x_sha], axis=1)
         # shuffle xSha
-        x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
+        x_sha = x_sha.apply(self.random_state.permutation, axis=0)
+        x_sha.columns = [f"Shadow_{i}" for i in range(x_sha.shape[1])]
         # get importance of the merged matrix
         if self.importance == "shap":
             imp = _get_shap_imp(
-                self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight
+                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
             )
         elif self.importance == "pimp":
             imp = _get_perm_imp(
-                self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight
+                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
             )
         else:
-            imp = _get_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
+            imp = _get_imp(self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight)

         # separate importances of real and shadow features
         imp_sha = imp[x_cur_w:]
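For context, the shadow-feature construction above is the heart of the Boruta-style test: every tentative column is copied and shuffled, so the shadows keep each feature's marginal distribution but lose any relation to the target, and a real feature is only confirmed if it beats the shadows. A standalone sketch of the pandas version with a made-up frame (illustrative, not the library code):

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(42)
    x_cur = pd.DataFrame({"age": [25, 32, 47, 51], "income": [40, 55, 90, 120]})

    x_sha = x_cur.copy()
    while x_sha.shape[1] < 5:                       # pad to at least 5 shadow columns
        x_sha = pd.concat([x_sha, x_sha], axis=1)
    x_sha = x_sha.apply(rng.permutation, axis=0)    # shuffle each column independently
    x_sha.columns = [f"Shadow_{i}" for i in range(x_sha.shape[1])]

    augmented = pd.concat([x_cur, x_sha], axis=1)   # what the estimator is fit on
    print(list(augmented.columns))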
@@ -943,7 +923,6 @@ def _split_fit_estimator(estimator, X, y, sample_weight=None, cat_feature=None):
     """
     if cat_feature is None:
         # detect, store and encode categorical predictors
-        X = pd.DataFrame(X)
         X, _, cat_idx = get_pandas_cat_codes(X)
     else:
         cat_idx = cat_feature
@@ -965,9 +944,6 @@ def _split_fit_estimator(estimator, X, y, sample_weight=None, cat_feature=None):
         X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42)
         w_tr, w_tt = None, None

-    X_tr = pd.DataFrame(X_tr)
-    X_tt = pd.DataFrame(X_tt)
-
     if check_if_tree_based(estimator):
         try:
             # handle cat features if supported by the fit method
@@ -1130,10 +1106,6 @@ def _get_imp(estimator, X, y, sample_weight=None, cat_feature=None):
     estimator = clone(estimator)

     try:
-        # handle categoricals
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-
         if cat_feature is None:
             X, _, cat_idx = get_pandas_cat_codes(X)
         else:
7 changes: 5 additions & 2 deletions src/arfs/utils.py
@@ -119,8 +119,11 @@ def get_pandas_cat_codes(X):
     obj_feat = dtypes_dic["cat"] + dtypes_dic["time"] + dtypes_dic["unk"]

     if obj_feat:
-        cat = X[obj_feat].stack().astype("str").astype("category").cat.codes.unstack()
-        X = pd.concat([X[X.columns.difference(obj_feat)], cat], axis=1)
+        for obj_column in obj_feat:
+            column = X[obj_column].astype("str").astype("category")
+            # performs label encoding
+            _, inverse = np.unique(column, return_inverse=True)
+            X[obj_column] = inverse
         cat_idx = [X.columns.get_loc(col) for col in obj_feat]
     else:
         obj_feat = None
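The rewritten loop label-encodes each object/datetime/unknown column on its own, whereas the old stack()/unstack() one-liner built a single shared category table across all such columns and could garble codes when columns had disjoint vocabularies or missing values. A standalone sketch of the per-column encoding (illustrative values):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"city": ["ghent", "liege", "ghent", None], "num": [1.0, 2.0, 3.0, 4.0]})

    col = X["city"].astype("str").astype("category")
    # np.unique sorts the stringified values; return_inverse maps every row
    # to the position of its value in that sorted vocabulary
    _, inverse = np.unique(col, return_inverse=True)
    X["city"] = inverse
    print(X["city"].tolist())   # [1, 2, 1, 0] given sorted ['None', 'ghent', 'liege']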
52 changes: 26 additions & 26 deletions tests/test_allrelevant.py
@@ -3,8 +3,8 @@
 import lightgbm as lgb
 from arfs.feature_selection.allrelevant import Leshy, BoostAGroota, GrootCV
 from arfs.utils import (
-    generated_corr_dataset_regr,
-    generated_corr_dataset_classification,
+    _generated_corr_dataset_regr,
+    _generated_corr_dataset_classification,
 )
 from arfs.utils import LightForestClassifier, LightForestRegressor

@@ -26,13 +26,13 @@ def test_borutaPy_vs_leshy_with_rfc_and_native_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_classification(size=100)
+        X, y, w = _generated_corr_dataset_classification(size=100)
         n_feat = X.shape[1]
         rfc = LightForestClassifier(n_feat)
         # RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100) # --> too slow
         arfs = Leshy(rfc, verbose=0, max_iter=10, random_state=42, importance="native")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
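This and the following test hunks all apply the same two renames: the private dataset helpers gained a leading underscore, and the selected-feature list is now rebuilt from the standard scikit-learn selector attributes instead of the removed support_names_. feature_names_in_ holds the column names seen at fit time and support_ is a boolean mask aligned with it, so a plain boolean index yields the kept names. The pattern in isolation (dummy values):

    import numpy as np

    feature_names_in_ = np.array(["var0", "var1", "var2", "var3"])
    support_ = np.array([True, False, True, False])

    selected = sorted(feature_names_in_[support_])   # boolean mask -> kept names
    print(selected)   # ['var0', 'var2']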
@@ -51,13 +51,13 @@ def test_borutaPy_vs_leshy_with_rfr_and_native_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_regr(size=100)
+        X, y, w = _generated_corr_dataset_regr(size=100)
         n_feat = X.shape[1]
         rfr = LightForestRegressor(n_feat)
         # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=10)
         arfs = Leshy(rfr, verbose=0, max_iter=10, random_state=42, importance="native")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
@@ -76,12 +76,12 @@ def test_borutaPy_vs_leshy_with_rfc_and_shap_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_classification(size=100)
+        X, y, w = _generated_corr_dataset_classification(size=100)
         n_feat = X.shape[1]
         model = LightForestClassifier(n_feat)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
@@ -100,12 +100,12 @@ def test_borutaPy_vs_leshy_with_rfr_and_shap_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_regr(size=500)
+        X, y, w = _generated_corr_dataset_regr(size=500)
         n_feat = X.shape[1]
         model = LightForestRegressor(n_feat)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
@@ -115,11 +115,11 @@ def test_borutaPy_vs_leshy_with_rfr_and_shap_feature_importance(self):
     def test_leshy_clf_with_lgb_and_shap_feature_importance_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -128,11 +128,11 @@ def test_leshy_clf_with_lgb_and_shap_feature_importance_and_sample_weight(self):
     def test_leshy_regr_with_lgb_and_shap_feature_importance_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -149,7 +149,7 @@ def test_boostagroota_clf_with_lgb_and_shap_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -161,7 +161,7 @@
             importance="shap",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -172,7 +172,7 @@ def test_boostagroota_clf_with_lgb_and_pimp_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -184,7 +184,7 @@
             importance="pimp",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -195,7 +195,7 @@ def test_boostagroota_rgr_with_lgb_and_shap_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_regr(size=500)
+        X, y, w = _generated_corr_dataset_regr(size=500)
         model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -207,7 +207,7 @@
             importance="shap",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -218,7 +218,7 @@ def test_boostagroota_regr_with_lgb_and_pimp_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_regr(size=500)
+        X, y, w = _generated_corr_dataset_regr(size=500)
         model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -230,7 +230,7 @@
             importance="pimp",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -245,10 +245,10 @@ class TestGrootCV:
     def test_grootcv_classification_with_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=100)
+        X, y, w = _generated_corr_dataset_classification(size=100)
         arfs = GrootCV(objective="binary", cutoff=1, n_folds=3, n_iter=3, silent=False)
         arfs.fit(X, y, w)
-        grootcv_list = sorted(arfs.support_names_)
+        grootcv_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(grootcv_list)
@@ -257,10 +257,10 @@ def test_grootcv_classification_with_and_sample_weight(self):
     def test_grootcv_regression_with_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_regr(size=100)
+        X, y, w = _generated_corr_dataset_regr(size=100)
         arfs = GrootCV(objective="l2", cutoff=1, n_folds=3, n_iter=3, silent=False)
         arfs.fit(X, y, w)
-        grootcv_list = sorted(arfs.support_names_)
+        grootcv_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(grootcv_list)
8 changes: 5 additions & 3 deletions tests/test_featselect.py
@@ -37,7 +37,7 @@ class TestFeatSelectZeroVariance:
     def test_identify_single_unique_classification(self):
         # not task dependent (same for clf and regr)
         X, y, w = _generated_corr_dataset_classification(size=10)
-        fs = UniqueValuesThreshold(threshold=1)
+        fs = UniqueValuesThreshold(threshold=2)
         fs.fit(X)
         message = "Expected: {0}, Actual: {1}".format(
             "var10", fs.not_selected_features_
@@ -55,10 +55,12 @@ def test_identify_high_cardinality_classification(self):
         X, y, w = _generated_corr_dataset_classification(size=100)
         fs = CardinalityThreshold(threshold=5)
         fs.fit(X)
+        expected = sorted(["dummy", "nice_guys"])
+        actual = sorted(list(fs.not_selected_features_))
         message = "Expected: {0}, Actual: {1}".format(
-            "emb_dummy", fs.not_selected_features_
+            expected, actual
         )
-        assert fs.not_selected_features_ == ["emb_dummy"], message
+        assert actual == expected, message


 # class TestFeatSelectCollinearity:
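Both test fixes fit the same reading: after the encoding change, the selectors reason about per-column unique counts on the encoded frame, so UniqueValuesThreshold(threshold=2) drops constant columns and CardinalityThreshold(threshold=5) flags the high-cardinality categoricals. A rough sketch of what such checks reduce to (my own illustration with invented columns, not the arfs implementation):

    import pandas as pd

    X = pd.DataFrame({
        "var10": [1] * 6,                             # constant column
        "nice_guys": ["a", "b", "c", "d", "e", "f"],  # 6 distinct labels
        "flag": [0, 1, 0, 1, 0, 1],
    })

    n_unique = X.nunique()
    low_info = X.columns[n_unique < 2]                            # min-unique-values style cut
    high_card = X.columns[(X.dtypes == object) & (n_unique > 5)]  # max-cardinality style cut
    print(list(low_info), list(high_card))                        # ['var10'] ['nice_guys']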
