
Commit 3ef9b40

Pass pandas DataFrame to estimator when input is a pandas DataFrame
ClaudioSalvatoreArcidiacono committed Jan 15, 2024
1 parent d6ee3f2 commit 3ef9b40
Showing 2 changed files with 36 additions and 10 deletions.
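
Before this change, fit() always sliced the input positionally (X[:, features]), so the wrapped
estimator only ever received NumPy arrays even when the caller passed a pandas DataFrame. After
the change, DataFrame inputs are subset by column name and forwarded as DataFrames, which lets
estimators that rely on column names (for example a Pipeline containing a ColumnTransformer) work
inside the selector. A minimal sketch of the usage this enables, mirroring the updated test further
below; the toy data, column names and ShuffleSplit settings are illustrative and not taken from the
repository:

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from felimination.rfe import PermutationImportanceRFECV

# Toy data: 5 informative columns (x1..x5) plus 2 random ones, wrapped in a DataFrame.
X, y = make_classification(n_samples=200, n_features=5, n_informative=5,
                           n_redundant=0, random_state=42)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(1, 6)])
rng = np.random.default_rng(42)
X["rand_1"] = rng.normal(size=len(X))
X["rand_2"] = rng.normal(size=len(X))

# The ColumnTransformer refers to "x1" by name, so it only works if the selector
# forwards a DataFrame (and "x1" is expected to survive the elimination).
ct = ColumnTransformer([("scaler", StandardScaler(), ["x1"])], remainder="passthrough")
estimator = Pipeline([("ct", ct), ("lr", LogisticRegression(random_state=42))])

selector = PermutationImportanceRFECV(
    estimator,
    cv=ShuffleSplit(n_splits=3, test_size=0.3, random_state=42),
    n_features_to_select=5,
)
selector.fit(X, y)
print(selector.support_)  # expected to keep x1..x5 and drop the random columns
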
felimination/rfe.py (30 changes: 23 additions & 7 deletions)
@@ -334,6 +334,16 @@ def __init__(
             importance_getter=importance_getter,
         )
 
+    @staticmethod
+    def _select_X_with_remaining_features(X, support, n_features):
+        features = np.arange(n_features)[support]
+        if isinstance(X, pd.DataFrame):
+            feature_names = X.columns[support]
+            X_remaining_features = X[feature_names]
+        else:
+            X_remaining_features = X[:, features]
+        return X_remaining_features, features
+
     def fit(self, X, y, groups=None, **fit_params):
         """Fit the RFE model and then the underlying estimator on the selected features.
@@ -354,7 +364,7 @@ def fit(self, X, y, groups=None, **fit_params):
         """
         self._validate_params()
         tags = self._get_tags()
-        X, y = self._validate_data(
+        self._validate_data(
             X,
             y,
             accept_sparse="csc",
@@ -385,8 +395,9 @@ def fit(self, X, y, groups=None, **fit_params):
         # Elimination
         while current_number_of_features > n_features_to_select:
             # Select remaining features
-            features = np.arange(n_features)[support_]
-            X_remaining_features = X[:, features]
+            X_remaining_features, features = self._select_X_with_remaining_features(
+                X, support=support_, n_features=n_features
+            )
 
             if self.verbose > 0:
                 print(
@@ -456,8 +467,10 @@ def fit(self, X, y, groups=None, **fit_params):
         # Set final attributes
 
         # Estimate performances of final model
-        features = np.arange(n_features)[support_]
-        X_remaining_features = X[:, features]
+        X_remaining_features, features = self._select_X_with_remaining_features(
+            X, support=support_, n_features=n_features
+        )
+
         cv_scores = cross_validate(
             self.estimator,
             X_remaining_features,
@@ -482,9 +495,12 @@ def fit(self, X, y, groups=None, **fit_params):
             np.std(scores_per_fold)
         )
 
-        features = np.arange(n_features)[support_]
+        X_remaining_features, features = self._select_X_with_remaining_features(
+            X, support=support_, n_features=n_features
+        )
 
         self.estimator_ = clone(self.estimator)
-        self.estimator_.fit(X[:, features], y, **fit_params)
+        self.estimator_.fit(X_remaining_features, y, **fit_params)
 
         self.n_features_ = support_.sum()
         self.support_ = support_
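
The new _select_X_with_remaining_features helper is where the DataFrame handling lives: for a
DataFrame it subsets by column name, preserving the DataFrame type and its column labels; for
anything else it falls back to the original positional indexing. A standalone sketch of that
branch logic with made-up data (not code from the repository):

import numpy as np
import pandas as pd

support = np.array([True, False, True])
features = np.arange(support.size)[support]

X_df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "x3": [5, 6]})
X_arr = X_df.to_numpy()

# DataFrame input: select the surviving columns by name, the result is still a DataFrame.
print(X_df[X_df.columns[support]])   # columns x1 and x3, names preserved

# ndarray input: positional indexing, as before this commit.
print(X_arr[:, features])

The companion change in fit() drops the "X, y = " assignment around self._validate_data: the
validation still runs, but since _validate_data returns NumPy-converted copies, reassigning its
result would have discarded the column names before the helper (and ultimately the estimator)
ever saw them.
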
tests/test_rfe.py (16 changes: 13 additions & 3 deletions)
@@ -1,11 +1,14 @@
 import numpy as np
+import pandas as pd
 import pytest
 from matplotlib.testing.compare import compare_images
+from sklearn.compose import ColumnTransformer
 from sklearn.datasets import make_classification, make_friedman1
+from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.model_selection import ShuffleSplit
-from sklearn.exceptions import NotFittedError
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 
 from felimination.rfe import PermutationImportanceRFECV


@@ -133,8 +136,15 @@ def test_perm_imp_rfecv_classification_base_case_pandas(
     random_state,
 ):
     X_with_rand, y = x_y_classification_with_rand_columns_pandas
+    ct = ColumnTransformer(
+        [
+            ("scaler", StandardScaler(), ["x1"]),
+        ],
+        remainder="passthrough",
+    )
+    estimator = Pipeline([("ct", ct), ("lr", LogisticRegression(random_state=42))])
     selector = PermutationImportanceRFECV(
-        LogisticRegression(random_state=random_state),
+        estimator,
         cv=cv,
         n_features_to_select=n_useful_features_classification,
     )
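
The updated pandas test wraps the logistic regression in a Pipeline whose ColumnTransformer
selects the column "x1" by name; that only succeeds when the selector forwards a DataFrame,
because scikit-learn rejects string column selectors for plain arrays. A quick illustration of
that constraint (toy data, not from the test suite):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer(
    [("scaler", StandardScaler(), ["x1"])],
    remainder="passthrough",
)

df = pd.DataFrame({"x1": [1.0, 2.0], "x2": [3.0, 4.0]})
print(ct.fit_transform(df))          # works: "x1" is resolved by column name

try:
    ct.fit_transform(df.to_numpy())  # expected to fail: string selectors need a DataFrame
except (TypeError, ValueError) as exc:
    print(exc)
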
