From 7284946614e6a2d843795119527da6b0d6808422 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 28 Jun 2020 02:26:13 +0800
Subject: [PATCH] [python][scikit-learn] new stacking tests and make number of
 features a property (#3173)

* modify attribute and include stacking tests

* backwards compatibility

* check sklearn version

* move stacking import

* Number of input features (#3173)

* Number of input features (#3173)

* Number of input features (#3173)

* Number of input features (#3173)

Split number of features and stacking tests.

* Number of input features (#3173)

Modify test name.

* Number of input features (#3173)

Update stacking tests for review comments.

* Number of input features (#3173)

* Number of input features (#3173)

* Number of input features (#3173)

* Number of input features (#3173)

Modify classifier test.

* Number of input features (#3173)

* Number of input features (#3173)

Check score.
---
 python-package/lightgbm/sklearn.py        | 17 ++++---
 tests/python_package_test/test_sklearn.py | 50 +++++++++++++++++++++++
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 661eb3226efe..24cd96471220 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -253,11 +253,6 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
 
             \*\*kwargs is not supported in sklearn, it may cause unexpected issues.
 
-        Attributes
-        ----------
-        n_features_in_ : int
-            The number of features of fitted model.
-
         Note
         ----
         A custom objective function can be provided for the ``objective`` parameter.
@@ -313,6 +308,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
         self._class_weight = None
         self._class_map = None
         self._n_features = None
+        self._n_features_in = None
         self._classes = None
         self._n_classes = None
         self.set_params(**kwargs)
@@ -545,8 +541,8 @@ def fit(self, X, y,
                 sample_weight = np.multiply(sample_weight, class_sample_weight)
 
         self._n_features = _X.shape[1]
-        # set public attribute for consistency
-        self.n_features_in_ = self._n_features
+        # copy for consistency
+        self._n_features_in = self._n_features
 
         def _construct_dataset(X, y, sample_weight, init_score, group, params,
                                categorical_feature='auto'):
@@ -675,6 +671,13 @@ def n_features_(self):
             raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')
         return self._n_features
 
+    @property
+    def n_features_in_(self):
+        """:obj:`int`: The number of features of fitted model."""
+        if self._n_features_in is None:
+            raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.')
+        return self._n_features_in
+
     @property
     def best_score_(self):
         """:obj:`dict` or :obj:`None`: The best score of fitted model."""
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index cd50805a70b5..b47a5f0e32b8 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -163,6 +163,56 @@ def test_dart(self):
         self.assertGreaterEqual(score, 0.8)
         self.assertLessEqual(score, 1.)
 
+    # sklearn <0.23 does not have a stacking classifier and n_features_in_ property
+    @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23')
+    def test_stacking_classifier(self):
+        from sklearn.ensemble import StackingClassifier
+
+        X, y = load_iris(return_X_y=True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+        classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)),
+                       ('gbm2', lgb.LGBMClassifier(n_estimators=3))]
+        clf = StackingClassifier(estimators=classifiers,
+                                 final_estimator=lgb.LGBMClassifier(n_estimators=3),
+                                 passthrough=True)
+        clf.fit(X_train, y_train)
+        score = clf.score(X_test, y_test)
+        self.assertGreaterEqual(score, 0.8)
+        self.assertLessEqual(score, 1.)
+        self.assertEqual(clf.n_features_in_, 4)  # number of input features
+        self.assertEqual(len(clf.named_estimators_['gbm1'].feature_importances_), 4)
+        self.assertEqual(clf.named_estimators_['gbm1'].n_features_in_,
+                         clf.named_estimators_['gbm2'].n_features_in_)
+        self.assertEqual(clf.final_estimator_.n_features_in_, 10)  # number of concatenated features
+        self.assertEqual(len(clf.final_estimator_.feature_importances_), 10)
+        classes = clf.named_estimators_['gbm1'].classes_ == clf.named_estimators_['gbm2'].classes_
+        self.assertTrue(all(classes))
+        classes = clf.classes_ == clf.named_estimators_['gbm1'].classes_
+        self.assertTrue(all(classes))
+
+    # sklearn <0.23 does not have a stacking regressor and n_features_in_ property
+    @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23')
+    def test_stacking_regressor(self):
+        from sklearn.ensemble import StackingRegressor
+
+        X, y = load_boston(return_X_y=True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+        regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)),
+                      ('gbm2', lgb.LGBMRegressor(n_estimators=3))]
+        reg = StackingRegressor(estimators=regressors,
+                                final_estimator=lgb.LGBMRegressor(n_estimators=3),
+                                passthrough=True)
+        reg.fit(X_train, y_train)
+        score = reg.score(X_test, y_test)
+        self.assertGreaterEqual(score, 0.2)
+        self.assertLessEqual(score, 1.)
+        self.assertEqual(reg.n_features_in_, 13)  # number of input features
+        self.assertEqual(len(reg.named_estimators_['gbm1'].feature_importances_), 13)
+        self.assertEqual(reg.named_estimators_['gbm1'].n_features_in_,
+                         reg.named_estimators_['gbm2'].n_features_in_)
+        self.assertEqual(reg.final_estimator_.n_features_in_, 15)  # number of concatenated features
+        self.assertEqual(len(reg.final_estimator_.feature_importances_), 15)
+
     def test_grid_search(self):
         X, y = load_iris(True)
         y = np.array(list(map(str, y)))  # utilize label encoder at it's max power
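A quick local check of the behavior this patch introduces (a sketch, not part
of the patch; it assumes a LightGBM build with this change applied, and the
dataset and n_estimators values are illustrative). Catching sklearn's
NotFittedError relies on LGBMNotFittedError in lightgbm.compat subclassing it:

    import lightgbm as lgb
    from sklearn.datasets import load_iris
    from sklearn.exceptions import NotFittedError

    X, y = load_iris(return_X_y=True)
    clf = lgb.LGBMClassifier(n_estimators=3)  # illustrative settings

    # Before fit(), the new read-only property raises LGBMNotFittedError
    # instead of the AttributeError the old plain attribute produced.
    try:
        clf.n_features_in_
    except NotFittedError:
        print('n_features_in_ is unavailable before fit')

    clf.fit(X, y)
    print(clf.n_features_in_)  # 4, consistent with the existing clf.n_features_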