Merge pull request #20 from Tialo/main
Fixed handling of categorical features.
ThomasBury authored May 15, 2023
2 parents ca71fb2 + 083b196 commit fbd355b
Showing 4 changed files with 49 additions and 72 deletions.
54 changes: 13 additions & 41 deletions src/arfs/feature_selection/allrelevant.py
@@ -337,18 +337,15 @@ def _fit(self, X_raw, y, sample_weight=None):
         # because the columns are dynamically created/rejected
         X = X_raw

-        X = np.nan_to_num(X)
-        y = np.nan_to_num(y)
+        X = X.apply(np.nan_to_num)
+        if not isinstance(y, pd.Series):
+            y = pd.Series(np.nan_to_num(y))
+        else:
+            y = y.apply(np.nan_to_num)

         # check input params
         self._check_params(X, y)

-        if not isinstance(X, np.ndarray):
-            X = self._validate_pandas_input(X)
-
-        if not isinstance(y, np.ndarray):
-            y = self._validate_pandas_input(y)
-
         if sample_weight is not None:
             if not isinstance(sample_weight, np.ndarray):
                 sample_weight = self._validate_pandas_input(sample_weight)
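Note on the first change: np.nan_to_num called on a whole DataFrame returns a bare ndarray, silently dropping column names and per-column dtypes, which would defeat the categorical handling this PR relies on downstream. Applying it column-wise keeps the frame intact. A minimal sketch of the difference (illustrative only; the frame and values are made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})

    as_array = np.nan_to_num(df)        # -> ndarray, column labels are gone
    as_frame = df.apply(np.nan_to_num)  # -> DataFrame, labels preserved

    print(type(as_array).__name__)      # ndarray
    print(list(as_frame.columns))       # ['a', 'b']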
@@ -441,9 +438,6 @@ def _fit(self, X_raw, y, sample_weight=None):
         self.imp_real_hist = imp_history
         self.sha_max = imp_sha_max

-        if isinstance(X_raw, np.ndarray):
-            X_raw = pd.DataFrame(X_raw)
-
         # absolute ranking
         vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.feature_names_in_)
         self.ranking_absolutes_ = list(
@@ -635,21 +629,6 @@ def _get_tree_num(self, n_feat):
         n_estimators = int(multi * f_repr)
         return n_estimators

-    def _get_shuffle(self, seq):
-        """private method, shuffle a sequence
-
-        Parameters
-        ----------
-        seq : np.array
-            the sequence to shuffle
-
-        Returns
-        -------
-        seq : np.array
-            the shuffled sequence
-        """
-        self.random_state.shuffle(seq)
-        return seq
-
     def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
         """Add a shuffled copy of the columns (shadows) and get the feature
         importance of the augmented data set
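The deleted _get_shuffle helper existed only because RandomState.shuffle permutes in place and returns None, so the old numpy path needed a wrapper that handed the sequence back. RandomState.permutation returns a shuffled copy, which can be passed straight to DataFrame.apply in the hunk below. A two-line contrast (illustrative):

    import numpy as np

    rng = np.random.RandomState(0)
    seq = np.array([1, 2, 3, 4])

    rng.shuffle(seq)             # shuffles in place, returns None -> needed a wrapper
    out = rng.permutation(seq)   # returns a shuffled copy, usable directly
    print(out)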
@@ -673,26 +652,27 @@ def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
         """
         # find features that are tentative still
         x_cur_ind = np.where(dec_reg >= 0)[0]
-        x_cur = np.copy(X[:, x_cur_ind])
+        x_cur = X.iloc[:, x_cur_ind].copy()
         x_cur_w = x_cur.shape[1]
         # deep copy the matrix for the shadow matrix
-        x_sha = np.copy(x_cur)
+        x_sha = x_cur.copy()
         # make sure there's at least 5 columns in the shadow matrix
         while x_sha.shape[1] < 5:
-            x_sha = np.hstack((x_sha, x_sha))
+            x_sha = pd.concat([x_sha, x_sha], axis=1)
         # shuffle xSha
-        x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
+        x_sha = x_sha.apply(self.random_state.permutation, axis=0)
+        x_sha.columns = [f"Shadow_{i}" for i in range(x_sha.shape[1])]
         # get importance of the merged matrix
         if self.importance == "shap":
             imp = _get_shap_imp(
-                self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight
+                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
             )
         elif self.importance == "pimp":
             imp = _get_perm_imp(
-                self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight
+                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
             )
         else:
-            imp = _get_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
+            imp = _get_imp(self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight)

         # separate importances of real and shadow features
         imp_sha = imp[x_cur_w:]
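For context, the shadow-feature construction above is the heart of the Boruta-style test: every tentative column is copied and shuffled, so the shadows keep each feature's marginal distribution but lose any relation to the target, and a real feature is only confirmed if it beats the shadows. A standalone sketch of the pandas version with a made-up frame (illustrative, not the library code):

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(42)
    x_cur = pd.DataFrame({"age": [25, 32, 47, 51], "income": [40, 55, 90, 120]})

    x_sha = x_cur.copy()
    while x_sha.shape[1] < 5:                       # pad to at least 5 shadow columns
        x_sha = pd.concat([x_sha, x_sha], axis=1)
    x_sha = x_sha.apply(rng.permutation, axis=0)    # shuffle each column independently
    x_sha.columns = [f"Shadow_{i}" for i in range(x_sha.shape[1])]

    augmented = pd.concat([x_cur, x_sha], axis=1)   # what the estimator is fit on
    print(list(augmented.columns))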
@@ -943,7 +923,6 @@ def _split_fit_estimator(estimator, X, y, sample_weight=None, cat_feature=None):
     """
     if cat_feature is None:
         # detect, store and encode categorical predictors
-        X = pd.DataFrame(X)
         X, _, cat_idx = get_pandas_cat_codes(X)
     else:
         cat_idx = cat_feature
@@ -965,9 +944,6 @@ def _split_fit_estimator(estimator, X, y, sample_weight=None, cat_feature=None):
         X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42)
         w_tr, w_tt = None, None

-    X_tr = pd.DataFrame(X_tr)
-    X_tt = pd.DataFrame(X_tt)
-
     if check_if_tree_based(estimator):
         try:
             # handle cat features if supported by the fit method
@@ -1130,10 +1106,6 @@ def _get_imp(estimator, X, y, sample_weight=None, cat_feature=None):
     estimator = clone(estimator)

     try:
-        # handle categoricals
-        if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X)
-
         if cat_feature is None:
             X, _, cat_idx = get_pandas_cat_codes(X)
         else:
7 changes: 5 additions & 2 deletions src/arfs/utils.py
@@ -119,8 +119,11 @@ def get_pandas_cat_codes(X):
     obj_feat = dtypes_dic["cat"] + dtypes_dic["time"] + dtypes_dic["unk"]

     if obj_feat:
-        cat = X[obj_feat].stack().astype("str").astype("category").cat.codes.unstack()
-        X = pd.concat([X[X.columns.difference(obj_feat)], cat], axis=1)
+        for obj_column in obj_feat:
+            column = X[obj_column].astype("str").astype("category")
+            # performs label encoding
+            _, inverse = np.unique(column, return_inverse=True)
+            X[obj_column] = inverse
         cat_idx = [X.columns.get_loc(col) for col in obj_feat]
     else:
         obj_feat = None
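The rewritten loop label-encodes each object/datetime/unknown column on its own, whereas the old stack()/unstack() one-liner built a single shared category table across all such columns and could garble codes when columns had disjoint vocabularies or missing values. A standalone sketch of the per-column encoding (illustrative values):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"city": ["ghent", "liege", "ghent", None], "num": [1.0, 2.0, 3.0, 4.0]})

    col = X["city"].astype("str").astype("category")
    # np.unique sorts the stringified values; return_inverse maps every row
    # to the position of its value in that sorted vocabulary
    _, inverse = np.unique(col, return_inverse=True)
    X["city"] = inverse
    print(X["city"].tolist())   # [1, 2, 1, 0] given sorted ['None', 'ghent', 'liege']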
52 changes: 26 additions & 26 deletions tests/test_allrelevant.py
@@ -3,8 +3,8 @@
 import lightgbm as lgb
 from arfs.feature_selection.allrelevant import Leshy, BoostAGroota, GrootCV
 from arfs.utils import (
-    generated_corr_dataset_regr,
-    generated_corr_dataset_classification,
+    _generated_corr_dataset_regr,
+    _generated_corr_dataset_classification,
 )
 from arfs.utils import LightForestClassifier, LightForestRegressor

@@ -26,13 +26,13 @@ def test_borutaPy_vs_leshy_with_rfc_and_native_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_classification(size=100)
+        X, y, w = _generated_corr_dataset_classification(size=100)
         n_feat = X.shape[1]
         rfc = LightForestClassifier(n_feat)
         # RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100) # --> too slow
         arfs = Leshy(rfc, verbose=0, max_iter=10, random_state=42, importance="native")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
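This and the following test hunks all apply the same two renames: the private dataset helpers gained a leading underscore, and the selected-feature list is now rebuilt from the standard scikit-learn selector attributes instead of the removed support_names_. feature_names_in_ holds the column names seen at fit time and support_ is a boolean mask aligned with it, so a plain boolean index yields the kept names. The pattern in isolation (dummy values):

    import numpy as np

    feature_names_in_ = np.array(["var0", "var1", "var2", "var3"])
    support_ = np.array([True, False, True, False])

    selected = sorted(feature_names_in_[support_])   # boolean mask -> kept names
    print(selected)   # ['var0', 'var2']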
@@ -51,13 +51,13 @@ def test_borutaPy_vs_leshy_with_rfr_and_native_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_regr(size=100)
+        X, y, w = _generated_corr_dataset_regr(size=100)
         n_feat = X.shape[1]
         rfr = LightForestRegressor(n_feat)
         # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=10)
         arfs = Leshy(rfr, verbose=0, max_iter=10, random_state=42, importance="native")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
@@ -76,12 +76,12 @@ def test_borutaPy_vs_leshy_with_rfc_and_shap_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_classification(size=100)
+        X, y, w = _generated_corr_dataset_classification(size=100)
         n_feat = X.shape[1]
         model = LightForestClassifier(n_feat)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
@@ -100,12 +100,12 @@ def test_borutaPy_vs_leshy_with_rfr_and_shap_feature_importance(self):

         # lightGBM random forest implementation
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
-        X, y, w = generated_corr_dataset_regr(size=500)
+        X, y, w = _generated_corr_dataset_regr(size=500)
         n_feat = X.shape[1]
         model = LightForestRegressor(n_feat)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y)
-        leshy_rfc_list = sorted(arfs.support_names_)
+        leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])

         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
         assert bool(
@@ -115,11 +115,11 @@ def test_borutaPy_vs_leshy_with_rfr_and_shap_feature_importance(self):
     def test_leshy_clf_with_lgb_and_shap_feature_importance_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -128,11 +128,11 @@ def test_leshy_clf_with_lgb_and_shap_feature_importance_and_sample_weight(self):
     def test_leshy_regr_with_lgb_and_shap_feature_importance_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap")
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -149,7 +149,7 @@ def test_boostagroota_clf_with_lgb_and_shap_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -161,7 +161,7 @@
             importance="shap",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -172,7 +172,7 @@ def test_boostagroota_clf_with_lgb_and_pimp_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=500)
+        X, y, w = _generated_corr_dataset_classification(size=500)
         model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -184,7 +184,7 @@
             importance="pimp",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -195,7 +195,7 @@ def test_boostagroota_rgr_with_lgb_and_shap_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_regr(size=500)
+        X, y, w = _generated_corr_dataset_regr(size=500)
         model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -207,7 +207,7 @@
             importance="shap",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -218,7 +218,7 @@ def test_boostagroota_regr_with_lgb_and_pimp_feature_importance_and_sample_weight
     ):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_regr(size=500)
+        X, y, w = _generated_corr_dataset_regr(size=500)
         model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10)
         arfs = BoostAGroota(
             est=model,
@@ -230,7 +230,7 @@
             importance="pimp",
         )
         arfs.fit(X, y, w)
-        leshy_list = sorted(arfs.support_names_)
+        leshy_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(leshy_list)
@@ -245,10 +245,10 @@ class TestGrootCV:
     def test_grootcv_classification_with_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4"]

-        X, y, w = generated_corr_dataset_classification(size=100)
+        X, y, w = _generated_corr_dataset_classification(size=100)
         arfs = GrootCV(objective="binary", cutoff=1, n_folds=3, n_iter=3, silent=False)
         arfs.fit(X, y, w)
-        grootcv_list = sorted(arfs.support_names_)
+        grootcv_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(grootcv_list)
@@ -257,10 +257,10 @@ def test_grootcv_classification_with_and_sample_weight(self):
     def test_grootcv_regression_with_and_sample_weight(self):
         baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"]

-        X, y, w = generated_corr_dataset_regr(size=100)
+        X, y, w = _generated_corr_dataset_regr(size=100)
         arfs = GrootCV(objective="l2", cutoff=1, n_folds=3, n_iter=3, silent=False)
         arfs.fit(X, y, w)
-        grootcv_list = sorted(arfs.support_names_)
+        grootcv_list = sorted(arfs.feature_names_in_[arfs.support_])

         assert bool(
             set(baseline_list) & set(grootcv_list)
8 changes: 5 additions & 3 deletions tests/test_featselect.py
@@ -37,7 +37,7 @@ class TestFeatSelectZeroVariance:
     def test_identify_single_unique_classification(self):
         # not task dependent (same for clf and regr)
         X, y, w = _generated_corr_dataset_classification(size=10)
-        fs = UniqueValuesThreshold(threshold=1)
+        fs = UniqueValuesThreshold(threshold=2)
         fs.fit(X)
         message = "Expected: {0}, Actual: {1}".format(
             "var10", fs.not_selected_features_
@@ -55,10 +55,12 @@ def test_identify_high_cardinality_classification(self):
         X, y, w = _generated_corr_dataset_classification(size=100)
         fs = CardinalityThreshold(threshold=5)
         fs.fit(X)
+        expected = sorted(["dummy", "nice_guys"])
+        actual = sorted(list(fs.not_selected_features_))
         message = "Expected: {0}, Actual: {1}".format(
-            "emb_dummy", fs.not_selected_features_
+            expected, actual
         )
-        assert fs.not_selected_features_ == ["emb_dummy"], message
+        assert actual == expected, message


 # class TestFeatSelectCollinearity:
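Both test fixes fit the same reading: after the encoding change, the selectors reason about per-column unique counts on the encoded frame, so UniqueValuesThreshold(threshold=2) drops constant columns and CardinalityThreshold(threshold=5) flags the high-cardinality categoricals. A rough sketch of what such checks reduce to (my own illustration with invented columns, not the arfs implementation):

    import pandas as pd

    X = pd.DataFrame({
        "var10": [1] * 6,                             # constant column
        "nice_guys": ["a", "b", "c", "d", "e", "f"],  # 6 distinct labels
        "flag": [0, 1, 0, 1, 0, 1],
    })

    n_unique = X.nunique()
    low_info = X.columns[n_unique < 2]                            # min-unique-values style cut
    high_card = X.columns[(X.dtypes == object) & (n_unique > 5)]  # max-cardinality style cut
    print(list(low_info), list(high_card))                        # ['var10'] ['nice_guys']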
