From f811c827086d1f6e9f2fd05686033564aae25298 Mon Sep 17 00:00:00 2001
From: Nick Miller <58093869+nicklamiller@users.noreply.github.com>
Date: Wed, 3 Jul 2024 16:13:47 -0700
Subject: [PATCH] [python-package] Add `feature_names_in_` attribute for
 scikit-learn estimators (fixes #6279) (#6310)

---
 python-package/lightgbm/sklearn.py        | 14 +++++++-
 tests/python_package_test/test_sklearn.py | 40 +++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

Reviewer notes (free-form region; ignored when the patch is applied):
  * sklearn.py: expands the `feature_name_` docstring and adds a read-only
    `feature_names_in_` property that returns `np.array(self.feature_name_)`.
    It raises LGBMNotFittedError with its own message when the estimator is
    not fitted.
  * test_sklearn.py: two parametrized tests cover LGBMModel, LGBMClassifier,
    LGBMRegressor and LGBMRanker. With numpy input the expected names are
    `Column_0` .. `Column_{n_features - 1}`; with pandas input the expected
    names are `X.columns`.

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 8fb998984720..7f3e91a064c4 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1144,11 +1144,23 @@ def feature_importances_(self) -> np.ndarray:
 
     @property
     def feature_name_(self) -> List[str]:
-        """:obj:`list` of shape = [n_features]: The names of features."""
+        """:obj:`list` of shape = [n_features]: The names of features.
+
+        .. note::
+
+            If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
+        """
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
         return self._Booster.feature_name()  # type: ignore[union-attr]
 
+    @property
+    def feature_names_in_(self) -> np.ndarray:
+        """:obj:`array` of shape = [n_features]: scikit-learn compatible version of ``.feature_name_``."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
+        return np.array(self.feature_name_)
+
 
 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
     """LightGBM regressor."""
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index b458192a2ee0..10af8ba960f3 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1290,6 +1290,46 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
     assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
 
 
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+def test_getting_feature_names_in_np_input(estimator_class):
+    # input is a numpy array, which doesn't have feature names. LightGBM adds
+    # feature names to the fitted model, which is inconsistent with sklearn's behavior
+    X, y = load_digits(n_class=2, return_X_y=True)
+    params = {"n_estimators": 2, "num_leaves": 7}
+    if estimator_class is lgb.LGBMModel:
+        model = estimator_class(**{**params, "objective": "binary"})
+    else:
+        model = estimator_class(**params)
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        check_is_fitted(model)
+    if isinstance(model, lgb.LGBMRanker):
+        model.fit(X, y, group=[X.shape[0]])
+    else:
+        model.fit(X, y)
+    np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
+
+
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+def test_getting_feature_names_in_pd_input(estimator_class):
+    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
+    col_names = X.columns.to_list()
+    assert isinstance(col_names, list) and all(
+        isinstance(c, str) for c in col_names
+    ), "input data must have feature names for this test to cover the expected functionality"
+    params = {"n_estimators": 2, "num_leaves": 7}
+    if estimator_class is lgb.LGBMModel:
+        model = estimator_class(**{**params, "objective": "binary"})
+    else:
+        model = estimator_class(**params)
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        check_is_fitted(model)
+    if isinstance(model, lgb.LGBMRanker):
+        model.fit(X, y, group=[X.shape[0]])
+    else:
+        model.fit(X, y)
+    np.testing.assert_array_equal(model.feature_names_in_, X.columns)
+
+
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)