Skip to content

Commit

Permalink
[python-package] Add feature_names_in_ attribute for scikit-learn estimators (fixes #6279) (#6310)
Browse files Browse the repository at this point in the history
  • Loading branch information
nicklamiller authored Jul 3, 2024
1 parent 7d9106d commit f811c82
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 1 deletion.
14 changes: 13 additions & 1 deletion python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,11 +1144,23 @@ def feature_importances_(self) -> np.ndarray:

@property
def feature_name_(self) -> List[str]:
    """:obj:`list` of shape = [n_features]: The names of features.

    .. note::

        If input does not contain feature names, they will be added during fitting
        in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
    """
    # Names are read from the underlying Booster, so refuse to answer until
    # the estimator reports itself as fitted.
    if not self.__sklearn_is_fitted__():
        raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
    return self._Booster.feature_name()  # type: ignore[union-attr]

@property
def feature_names_in_(self) -> np.ndarray:
    """:obj:`array` of shape = [n_features]: scikit-learn compatible version of ``.feature_name_``."""
    # Happy path first: once fitted, expose the same names as ``feature_name_``
    # wrapped in a NumPy array.
    if self.__sklearn_is_fitted__():
        return np.array(self.feature_name_)
    raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")


class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
"""LightGBM regressor."""
Expand Down
40 changes: 40 additions & 0 deletions tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,6 +1290,46 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_np_input(estimator_class):
    """Fitting on a plain NumPy array exposes auto-generated ``Column_<i>`` names via ``feature_names_in_``."""
    # A numpy array carries no feature names. LightGBM nevertheless attaches
    # generated names to the fitted model, which differs from sklearn's behavior.
    X, y = load_digits(n_class=2, return_X_y=True)

    # The generic LGBMModel has no default objective, so one is supplied explicitly.
    init_kwargs = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        init_kwargs["objective"] = "binary"
    model = estimator_class(**init_kwargs)

    # Before fitting, the estimator must report itself as not fitted.
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)

    # The ranker additionally needs query group sizes; use a single group spanning all rows.
    fit_kwargs = {"group": [X.shape[0]]} if isinstance(model, lgb.LGBMRanker) else {}
    model.fit(X, y, **fit_kwargs)

    expected_names = np.array([f"Column_{i}" for i in range(X.shape[1])])
    np.testing.assert_array_equal(model.feature_names_in_, expected_names)


@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_pd_input(estimator_class):
    """Fitting on a DataFrame makes ``feature_names_in_`` match the frame's column names."""
    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)

    # Guard the test's own precondition: the frame must really carry string column names.
    col_names = X.columns.to_list()
    has_string_names = isinstance(col_names, list) and all(isinstance(c, str) for c in col_names)
    assert has_string_names, "input data must have feature names for this test to cover the expected functionality"

    # The generic LGBMModel has no default objective, so one is supplied explicitly.
    init_kwargs = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        init_kwargs["objective"] = "binary"
    model = estimator_class(**init_kwargs)

    # Before fitting, the estimator must report itself as not fitted.
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)

    # The ranker additionally needs query group sizes; use a single group spanning all rows.
    fit_kwargs = {"group": [X.shape[0]]} if isinstance(model, lgb.LGBMRanker) else {}
    model.fit(X, y, **fit_kwargs)

    np.testing.assert_array_equal(model.feature_names_in_, X.columns)


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)
Expand Down

0 comments on commit f811c82

Please sign in to comment.