Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[python][scikit-learn] new stacking tests and make number of features a property #3173

Merged
merged 19 commits into from
Jun 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,11 +253,6 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,

\*\*kwargs is not supported in sklearn; it may cause unexpected issues.

Attributes
----------
n_features_in_ : int
The number of features of fitted model.

Note
----
A custom objective function can be provided for the ``objective`` parameter.
Expand Down Expand Up @@ -313,6 +308,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
self._class_weight = None
self._class_map = None
self._n_features = None
self._n_features_in = None
self._classes = None
self._n_classes = None
self.set_params(**kwargs)
Expand Down Expand Up @@ -545,8 +541,8 @@ def fit(self, X, y,
sample_weight = np.multiply(sample_weight, class_sample_weight)

self._n_features = _X.shape[1]
# set public attribute for consistency
self.n_features_in_ = self._n_features
# copy for consistency
self._n_features_in = self._n_features

def _construct_dataset(X, y, sample_weight, init_score, group, params,
categorical_feature='auto'):
Expand Down Expand Up @@ -675,6 +671,13 @@ def n_features_(self):
raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')
return self._n_features

@property
def n_features_in_(self):
    """:obj:`int`: The number of features of fitted model."""
    # ``_n_features_in`` is populated by ``fit``; before that it is
    # ``None``, which we translate into the sklearn "not fitted" error.
    value = self._n_features_in
    if value is not None:
        return value
    raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.')

@property
def best_score_(self):
""":obj:`dict` or :obj:`None`: The best score of fitted model."""
Expand Down
50 changes: 50 additions & 0 deletions tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,56 @@ def test_dart(self):
self.assertGreaterEqual(score, 0.8)
self.assertLessEqual(score, 1.)

# sklearn <0.23 does not have a stacking classifier and n_features_in_ property
@unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23')
def test_stacking_classifier(self):
    """Stack two LGBMClassifiers and verify score, feature counts and classes."""
    from sklearn.ensemble import StackingClassifier

    data, target = load_iris(return_X_y=True)
    train_X, test_X, train_y, test_y = train_test_split(data, target, random_state=42)
    base_learners = [('gbm1', lgb.LGBMClassifier(n_estimators=3)),
                     ('gbm2', lgb.LGBMClassifier(n_estimators=3))]
    stacker = StackingClassifier(estimators=base_learners,
                                 final_estimator=lgb.LGBMClassifier(n_estimators=3),
                                 passthrough=True)
    stacker.fit(train_X, train_y)
    accuracy = stacker.score(test_X, test_y)
    self.assertGreaterEqual(accuracy, 0.8)
    self.assertLessEqual(accuracy, 1.)
    # iris provides 4 input features; both base learners see all of them.
    self.assertEqual(stacker.n_features_in_, 4)  # number of input features
    self.assertEqual(len(stacker.named_estimators_['gbm1'].feature_importances_), 4)
    self.assertEqual(stacker.named_estimators_['gbm1'].n_features_in_,
                     stacker.named_estimators_['gbm2'].n_features_in_)
    # With passthrough=True the final estimator gets the 4 raw features plus
    # the stacked predictions from the base learners (10 columns in total).
    self.assertEqual(stacker.final_estimator_.n_features_in_, 10)  # number of concatenated features
    self.assertEqual(len(stacker.final_estimator_.feature_importances_), 10)
    # Both base learners and the ensemble itself must agree on the class labels.
    same_classes = stacker.named_estimators_['gbm1'].classes_ == stacker.named_estimators_['gbm2'].classes_
    self.assertTrue(all(same_classes))
    same_classes = stacker.classes_ == stacker.named_estimators_['gbm1'].classes_
    self.assertTrue(all(same_classes))

# sklearn <0.23 does not have a stacking regressor and n_features_in_ property
@unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23')
def test_stacking_regressor(self):
    """Stack two LGBMRegressors and verify score and feature counts."""
    from sklearn.ensemble import StackingRegressor

    data, target = load_boston(return_X_y=True)
    train_X, test_X, train_y, test_y = train_test_split(data, target, random_state=42)
    base_learners = [('gbm1', lgb.LGBMRegressor(n_estimators=3)),
                     ('gbm2', lgb.LGBMRegressor(n_estimators=3))]
    stacker = StackingRegressor(estimators=base_learners,
                                final_estimator=lgb.LGBMRegressor(n_estimators=3),
                                passthrough=True)
    stacker.fit(train_X, train_y)
    r2 = stacker.score(test_X, test_y)
    self.assertGreaterEqual(r2, 0.2)
    self.assertLessEqual(r2, 1.)
    # boston provides 13 input features; both base learners see all of them.
    self.assertEqual(stacker.n_features_in_, 13)  # number of input features
    self.assertEqual(len(stacker.named_estimators_['gbm1'].feature_importances_), 13)
    self.assertEqual(stacker.named_estimators_['gbm1'].n_features_in_,
                     stacker.named_estimators_['gbm2'].n_features_in_)
    # With passthrough=True the final estimator gets the 13 raw features plus
    # one prediction column per base learner (15 columns in total).
    self.assertEqual(stacker.final_estimator_.n_features_in_, 15)  # number of concatenated features
    self.assertEqual(len(stacker.final_estimator_.feature_importances_), 15)

def test_grid_search(self):
X, y = load_iris(True)
y = np.array(list(map(str, y))) # utilize label encoder at it's max power
Expand Down