From 390ad322e155dd5668cd0e008376acffbcad69fc Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Thu, 18 Jun 2020 17:47:44 +0800 Subject: [PATCH 01/16] modify attribute and include stacking tests --- python-package/lightgbm/sklearn.py | 12 ++++++++++-- tests/python_package_test/test_sklearn.py | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index d86e510104e8..104fa61aadb4 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -333,6 +333,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1, self._class_weight = None self._class_map = None self._n_features = None + self._n_features_in = None self._classes = None self._n_classes = None self.set_params(**kwargs) @@ -565,8 +566,8 @@ def fit(self, X, y, sample_weight = np.multiply(sample_weight, class_sample_weight) self._n_features = _X.shape[1] - # set public attribute for consistency - self.n_features_in_ = self._n_features + # copy for consistency + self._n_features_in = self._n_features def _construct_dataset(X, y, sample_weight, init_score, group, params, categorical_feature='auto'): @@ -695,6 +696,13 @@ def n_features_(self): raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.') return self._n_features + @property + def n_features_in_(self): + """Get the number of features of fitted model.""" + if self._n_features_in is None: + raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.') + return self._n_features_in + @property def best_score_(self): """Get the best score of fitted model.""" diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index cd50805a70b5..6d0366a2f000 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -12,6 +12,7 @@ from sklearn.base import clone from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, load_iris, load_svmlight_file) +from sklearn.ensemble import StackingClassifier, StackingRegressor from sklearn.exceptions import SkipTestWarning from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import GridSearchCV, train_test_split @@ -163,6 +164,28 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) + def test_stacking_classifier(self): + X, y = load_iris(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + classifiers = [('gbm1', lgb.LGBMClassifier()), + ('gbm2', lgb.LGBMClassifier())] + clf = StackingClassifier(estimators=classifiers, + final_estimator=lgb.LGBMClassifier()) + clf.fit(X_train, y_train).score(X_test, y_test) + # test number of input features + self.assertEqual(clf.n_features_in_, 4) + + def test_stacking_regressor(self): + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + regressors = [('gbm1', lgb.LGBMRegressor()), + ('gbm2', lgb.LGBMRegressor())] + reg = StackingRegressor(estimators=regressors, + final_estimator=lgb.LGBMRegressor()) + reg.fit(X_train, y_train).score(X_test, y_test) + # test number of input features + self.assertEqual(reg.n_features_in_, 13) + def test_grid_search(self): X, y = load_iris(True) y = np.array(list(map(str, y))) # utilize label encoder at it's max power From 61ffd2573f4edf60802e589f0d3c61c475822609 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Thu, 18 Jun 2020 18:21:35 +0800 Subject: [PATCH 02/16] backwards compatibility --- tests/python_package_test/test_sklearn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 6d0366a2f000..67b2db3b1bcd 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -172,8 +172,9 @@ def test_stacking_classifier(self): clf = StackingClassifier(estimators=classifiers, final_estimator=lgb.LGBMClassifier()) clf.fit(X_train, y_train).score(X_test, y_test) - # test number of input features - self.assertEqual(clf.n_features_in_, 4) + if hasattr(self, 'n_features_in_'): + # test number of input features + self.assertEqual(clf.n_features_in_, 4) def test_stacking_regressor(self): X, y = load_boston(return_X_y=True) @@ -183,8 +184,9 @@ def test_stacking_regressor(self): reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor()) reg.fit(X_train, y_train).score(X_test, y_test) - # test number of input features - self.assertEqual(reg.n_features_in_, 13) + if hasattr(self, 'n_features_in_'): + # test number of input features + self.assertEqual(reg.n_features_in_, 13) def test_grid_search(self): X, y = load_iris(True) From fefb3f90a1a0942d6e8e2164be5a25889fc9369f Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Thu, 18 Jun 2020 18:46:54 +0800 Subject: [PATCH 03/16] check sklearn version --- tests/python_package_test/test_sklearn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 67b2db3b1bcd..399babf4cb0b 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -164,6 +164,8 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) + # sklearn <0.22 does not have a stacking classifier + @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_classifier(self): X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -176,6 +178,8 @@ def test_stacking_classifier(self): # test number of input features self.assertEqual(clf.n_features_in_, 4) + # sklearn <0.22 does not have a stacking regressor + @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_regressor(self): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) From 4f51ab7caab8c8fe148f6ce0df138eddd8a5bdb3 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Thu, 18 Jun 2020 19:04:49 +0800 Subject: [PATCH 04/16] move stacking import --- tests/python_package_test/test_sklearn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 399babf4cb0b..dfe7aff85614 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -12,7 +12,6 @@ from sklearn.base import clone from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, load_iris, load_svmlight_file) -from sklearn.ensemble import StackingClassifier, StackingRegressor from sklearn.exceptions import SkipTestWarning from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import GridSearchCV, train_test_split @@ -167,6 +166,8 @@ def test_dart(self): # sklearn <0.22 does not have a stacking classifier @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_classifier(self): + from sklearn.ensemble import StackingClassifier + X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) classifiers = [('gbm1', lgb.LGBMClassifier()), @@ -181,6 +182,8 @@ def test_stacking_classifier(self): # sklearn <0.22 does not have a stacking regressor @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_regressor(self): + from sklearn.ensemble import StackingRegressor + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) regressors = [('gbm1', lgb.LGBMRegressor()), From 157b27fecf7cc7f51016d819124db401936a59c0 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Fri, 26 Jun 2020 10:50:40 +0800 Subject: [PATCH 05/16] Number of input features (#3173) --- tests/python_package_test/test_sklearn.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index dfe7aff85614..b0f097f6e4c0 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -163,10 +163,11 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) - # sklearn <0.22 does not have a stacking classifier - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + # sklearn < 0.23 does not have a stacking classifier and n_features_in_ attribute + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier + from sklearn.utils.estimator_checks import check_n_features_in X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -175,14 +176,15 @@ def test_stacking_classifier(self): clf = StackingClassifier(estimators=classifiers, final_estimator=lgb.LGBMClassifier()) clf.fit(X_train, y_train).score(X_test, y_test) - if hasattr(self, 'n_features_in_'): - # test number of input features - self.assertEqual(clf.n_features_in_, 4) + self.assertEqual(clf.n_features_in_, 4) # test number of input features + name = clf.__class__.__name__ + check_n_features_in(name, clf) # test sklearn API compatibility - # sklearn <0.22 does not have a stacking regressor - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + # sklearn < 0.23 does not have a stacking regressor and n_features_in_ attribute + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor + from sklearn.utils.estimator_checks import check_n_features_in X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -191,9 +193,9 @@ def test_stacking_regressor(self): reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor()) reg.fit(X_train, y_train).score(X_test, y_test) - if hasattr(self, 'n_features_in_'): - # test number of input features - self.assertEqual(reg.n_features_in_, 13) + self.assertEqual(reg.n_features_in_, 13) # test number of input features + name = reg.__class__.__name__ + check_n_features_in(name, reg) # test sklearn API compatibility def test_grid_search(self): X, y = load_iris(True) From 7d8dafe307ec203b6e57fd0d5c68763394e3f0c0 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Fri, 26 Jun 2020 11:06:57 +0800 Subject: [PATCH 06/16] Number of input features (#3173) --- python-package/lightgbm/sklearn.py | 12 ++++++-- tests/python_package_test/test_sklearn.py | 34 +++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 661eb3226efe..7939d092bc4a 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -313,6 +313,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1, self._class_weight = None self._class_map = None self._n_features = None + self._n_features_in = None self._classes = None self._n_classes = None self.set_params(**kwargs) @@ -545,8 +546,8 @@ def fit(self, X, y, sample_weight = np.multiply(sample_weight, class_sample_weight) self._n_features = _X.shape[1] - # set public attribute for consistency - self.n_features_in_ = self._n_features + # copy for consistency + self._n_features_in = self._n_features def _construct_dataset(X, y, sample_weight, init_score, group, params, categorical_feature='auto'): @@ -675,6 +676,13 @@ def n_features_(self): raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.') return self._n_features + @property + def n_features_in_(self): + """Get the number of features of fitted model.""" + if self._n_features_in is None: + raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.') + return self._n_features_in + @property def best_score_(self): """:obj:`dict` or :obj:`None`: The best score of fitted model.""" diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index cd50805a70b5..b0f097f6e4c0 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -163,6 +163,40 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) + # sklearn < 0.23 does not have a stacking classifier and n_features_in_ attribute + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + def test_stacking_classifier(self): + from sklearn.ensemble import StackingClassifier + from sklearn.utils.estimator_checks import check_n_features_in + + X, y = load_iris(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + classifiers = [('gbm1', lgb.LGBMClassifier()), + ('gbm2', lgb.LGBMClassifier())] + clf = StackingClassifier(estimators=classifiers, + final_estimator=lgb.LGBMClassifier()) + clf.fit(X_train, y_train).score(X_test, y_test) + self.assertEqual(clf.n_features_in_, 4) # test number of input features + name = clf.__class__.__name__ + check_n_features_in(name, clf) # test sklearn API compatibility + + # sklearn < 0.23 does not have a stacking regressor and n_features_in_ attribute + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + def test_stacking_regressor(self): + from sklearn.ensemble import StackingRegressor + from sklearn.utils.estimator_checks import check_n_features_in + + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + regressors = [('gbm1', lgb.LGBMRegressor()), + ('gbm2', lgb.LGBMRegressor())] + reg = StackingRegressor(estimators=regressors, + final_estimator=lgb.LGBMRegressor()) + reg.fit(X_train, y_train).score(X_test, y_test) + self.assertEqual(reg.n_features_in_, 13) # test number of input features + name = reg.__class__.__name__ + check_n_features_in(name, reg) # test sklearn API compatibility + def test_grid_search(self): X, y = load_iris(True) y = np.array(list(map(str, y))) # utilize label encoder at it's max power From 2e55ee04d2531cbdbe0c0f95e344837d9943157b Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Fri, 26 Jun 2020 11:52:35 +0800 Subject: [PATCH 07/16] Number of input features (#3173) --- python-package/lightgbm/sklearn.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7939d092bc4a..24cd96471220 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -253,11 +253,6 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1, \*\*kwargs is not supported in sklearn, it may cause unexpected issues. - Attributes - ---------- - n_features_in_ : int - The number of features of fitted model. - Note ---- A custom objective function can be provided for the ``objective`` parameter. @@ -678,7 +673,7 @@ def n_features_(self): @property def n_features_in_(self): - """Get the number of features of fitted model.""" + """:obj:`int`: The number of features of fitted model.""" if self._n_features_in is None: raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.') return self._n_features_in From 4592c58edd56d82cca8ad71c7d100c3628e0367c Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Fri, 26 Jun 2020 13:57:19 +0800 Subject: [PATCH 08/16] Number of input features (#3173) Split number of features and stacking tests. --- tests/python_package_test/test_sklearn.py | 32 +++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index b0f097f6e4c0..ccf902b060e3 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -163,11 +163,10 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) - # sklearn < 0.23 does not have a stacking classifier and n_features_in_ attribute - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + # sklearn < 0.22 does not have a stacking classifier + @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier - from sklearn.utils.estimator_checks import check_n_features_in X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -176,15 +175,12 @@ def test_stacking_classifier(self): clf = StackingClassifier(estimators=classifiers, final_estimator=lgb.LGBMClassifier()) clf.fit(X_train, y_train).score(X_test, y_test) - self.assertEqual(clf.n_features_in_, 4) # test number of input features - name = clf.__class__.__name__ - check_n_features_in(name, clf) # test sklearn API compatibility + self.assertEqual(clf.n_features_in_, 4) # test number of input features - # sklearn < 0.23 does not have a stacking regressor and n_features_in_ attribute - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + # sklearn < 0.22 does not have a stacking regressor + @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor - from sklearn.utils.estimator_checks import check_n_features_in X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -193,9 +189,23 @@ def test_stacking_regressor(self): reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor()) reg.fit(X_train, y_train).score(X_test, y_test) - self.assertEqual(reg.n_features_in_, 13) # test number of input features + self.assertEqual(reg.n_features_in_, 13) # test number of input features + + # sklearn < 0.23 does not have n_features_in_ attribute + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + def test_n_features_in_(): + from sklearn.utils.estimator_checks import check_n_features_in + + X, y = load_iris(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + clf = lgb.LGBMClassifier() + name = clf.__class__.__name__ + check_n_features_in(name, clf) # test classifier compatibility + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + reg = lgb.LGBMRegressor() name = reg.__class__.__name__ - check_n_features_in(name, reg) # test sklearn API compatibility + check_n_features_in(name, reg) # test regressor compatibility def test_grid_search(self): X, y = load_iris(True) From a6c1de757675213da3aa14a078f41ee27ba477bd Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Fri, 26 Jun 2020 14:32:42 +0800 Subject: [PATCH 09/16] Number of input features (#3173) Modify test name. --- tests/python_package_test/test_sklearn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index ccf902b060e3..578172302993 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -191,9 +191,9 @@ def test_stacking_regressor(self): reg.fit(X_train, y_train).score(X_test, y_test) self.assertEqual(reg.n_features_in_, 13) # test number of input features - # sklearn < 0.23 does not have n_features_in_ attribute + # sklearn < 0.23 does not have n_features_in_ @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') - def test_n_features_in_(): + def test_n_features_in(self): from sklearn.utils.estimator_checks import check_n_features_in X, y = load_iris(return_X_y=True) From 9cf52742e459c6623683d4927456f6e729397022 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 12:03:48 +0800 Subject: [PATCH 10/16] Number of input features (#3173) Update stacking tests for review comments. --- tests/python_package_test/test_sklearn.py | 53 +++++++++++------------ 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 578172302993..a10d9b80d072 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -163,49 +163,48 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) - # sklearn < 0.22 does not have a stacking classifier + # sklearn <0.22 does not have a stacking classifier @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - classifiers = [('gbm1', lgb.LGBMClassifier()), - ('gbm2', lgb.LGBMClassifier())] + classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)), + ('gbm2', lgb.LGBMClassifier(n_estimators=3))] clf = StackingClassifier(estimators=classifiers, - final_estimator=lgb.LGBMClassifier()) + final_estimator=lgb.LGBMClassifier(n_estimators=3), + passthrough=True) clf.fit(X_train, y_train).score(X_test, y_test) - self.assertEqual(clf.n_features_in_, 4) # test number of input features - - # sklearn < 0.22 does not have a stacking regressor + self.assertEqual(clf.n_features_in_, 4) # number of input features + self.assertEqual(len(clf.named_estimators_['gbm1'].feature_importances_), 4) + self.assertEqual(clf.named_estimators_['gbm1'].n_features_in_, + clf.named_estimators_['gbm2'].n_features_in_) + self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features + self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) + self.assertEqual(clf.classes_, np.array([0, 1, 2])) + self.assertEqual(clf.named_estimators_['gbm1'].classes_, + clf.named_estimators_['gbm2'].classes_) + + # sklearn <0.22 does not have a stacking regressor @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - regressors = [('gbm1', lgb.LGBMRegressor()), - ('gbm2', lgb.LGBMRegressor())] + regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)), + ('gbm2', lgb.LGBMRegressor(n_estimators=3))] reg = StackingRegressor(estimators=regressors, - final_estimator=lgb.LGBMRegressor()) + final_estimator=lgb.LGBMRegressor(n_estimators=3), + passthrough=True) reg.fit(X_train, y_train).score(X_test, y_test) - self.assertEqual(reg.n_features_in_, 13) # test number of input features - - # sklearn < 0.23 does not have n_features_in_ - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') - def test_n_features_in(self): - from sklearn.utils.estimator_checks import check_n_features_in - - X, y = load_iris(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - clf = lgb.LGBMClassifier() - name = clf.__class__.__name__ - check_n_features_in(name, clf) # test classifier compatibility - X, y = load_boston(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - reg = lgb.LGBMRegressor() - name = reg.__class__.__name__ - check_n_features_in(name, reg) # test regressor compatibility + self.assertEqual(reg.n_features_in_, 13) # number of input features + self.assertEqual(len(reg.named_estimators_['gbm1'].feature_importances_), 13) + self.assertEqual(reg.named_estimators_['gbm1'].n_features_in_, + reg.named_estimators_['gbm2'].n_features_in_) + self.assertEqual(reg.final_estimator_.n_features_in_, 15) # number of concatenated features + self.assertEqual(len(reg.final_estimator_.feature_importances_), 15) def test_grid_search(self): X, y = load_iris(True) From 4d7b262206326757a9166124821eef40425909e5 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 12:29:12 +0800 Subject: [PATCH 11/16] Number of input features (#3173) --- tests/python_package_test/test_sklearn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index a10d9b80d072..b83d319d7f0a 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -182,7 +182,6 @@ def test_stacking_classifier(self): clf.named_estimators_['gbm2'].n_features_in_) self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) - self.assertEqual(clf.classes_, np.array([0, 1, 2])) self.assertEqual(clf.named_estimators_['gbm1'].classes_, clf.named_estimators_['gbm2'].classes_) From d2312e19ed033966ca7fe8c1b892333b001e7e43 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 12:46:03 +0800 Subject: [PATCH 12/16] Number of input features (#3173) --- tests/python_package_test/test_sklearn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index b83d319d7f0a..fb38c14bbfd4 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -163,8 +163,8 @@ def test_dart(self): self.assertGreaterEqual(score, 0.8) self.assertLessEqual(score, 1.) - # sklearn <0.22 does not have a stacking classifier - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + # sklearn <0.23 does not have a stacking classifier and n_features_in_ property + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier @@ -182,11 +182,12 @@ def test_stacking_classifier(self): clf.named_estimators_['gbm2'].n_features_in_) self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) + self.assertEqual(clf.classes_, np.array([0, 1, 2])) self.assertEqual(clf.named_estimators_['gbm1'].classes_, clf.named_estimators_['gbm2'].classes_) - # sklearn <0.22 does not have a stacking regressor - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + # sklearn <0.23 does not have a stacking regressor and n_features_in_ property + @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor From f2f5ff94617ff1ba76481091fc2fdd849b812bbb Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 13:00:01 +0800 Subject: [PATCH 13/16] Number of input features (#3173) --- tests/python_package_test/test_sklearn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index fb38c14bbfd4..6cdc228b32c7 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -182,7 +182,6 @@ def test_stacking_classifier(self): clf.named_estimators_['gbm2'].n_features_in_) self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) - self.assertEqual(clf.classes_, np.array([0, 1, 2])) self.assertEqual(clf.named_estimators_['gbm1'].classes_, clf.named_estimators_['gbm2'].classes_) From 568750a7b1541c8d26d8762aa9dd09b5ce1b5dba Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 15:15:04 +0800 Subject: [PATCH 14/16] Number of input features (#3173) Modify classifier test. --- tests/python_package_test/test_sklearn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 6cdc228b32c7..58eca3f2570a 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -182,8 +182,11 @@ def test_stacking_classifier(self): clf.named_estimators_['gbm2'].n_features_in_) self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) - self.assertEqual(clf.named_estimators_['gbm1'].classes_, - clf.named_estimators_['gbm2'].classes_) + matching_classes = all(clf.named_estimators_['gbm1'].classes_ ==\ + clf.named_estimators_['gbm2'].classes_) + self.assertEqual(matching_classes, True) + matching_classes = all(clf.classes_ == clf.named_estimators_['gbm1'].classes_) + self.assertEqual(matching_classes, True) # sklearn <0.23 does not have a stacking regressor and n_features_in_ property @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') From 9384b89b5b71f0215666b61b01c6e62fd3cf987b Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 15:54:38 +0800 Subject: [PATCH 15/16] Number of input features (#3173) --- tests/python_package_test/test_sklearn.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 58eca3f2570a..63b12568ce01 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -182,11 +182,10 @@ def test_stacking_classifier(self): clf.named_estimators_['gbm2'].n_features_in_) self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) - matching_classes = all(clf.named_estimators_['gbm1'].classes_ ==\ - clf.named_estimators_['gbm2'].classes_) - self.assertEqual(matching_classes, True) - matching_classes = all(clf.classes_ == clf.named_estimators_['gbm1'].classes_) - self.assertEqual(matching_classes, True) + classes = clf.named_estimators_['gbm1'].classes_ == clf.named_estimators_['gbm2'].classes_ + self.assertEqual(all(classes), True) + classes = clf.classes_ == clf.named_estimators_['gbm1'].classes_ + self.assertEqual(all(classes), True) # sklearn <0.23 does not have a stacking regressor and n_features_in_ property @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') From ee3b9e3908685b0810aa5aa609427777e05de593 Mon Sep 17 00:00:00 2001 From: a-wozniakowski <wozn0001@e.ntu.edu.sg> Date: Sat, 27 Jun 2020 21:48:36 +0800 Subject: [PATCH 16/16] Number of input features (#3173) Check score. --- tests/python_package_test/test_sklearn.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 63b12568ce01..b47a5f0e32b8 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -175,7 +175,10 @@ def test_stacking_classifier(self): clf = StackingClassifier(estimators=classifiers, final_estimator=lgb.LGBMClassifier(n_estimators=3), passthrough=True) - clf.fit(X_train, y_train).score(X_test, y_test) + clf.fit(X_train, y_train) + score = clf.score(X_test, y_test) + self.assertGreaterEqual(score, 0.8) + self.assertLessEqual(score, 1.) self.assertEqual(clf.n_features_in_, 4) # number of input features self.assertEqual(len(clf.named_estimators_['gbm1'].feature_importances_), 4) self.assertEqual(clf.named_estimators_['gbm1'].n_features_in_, @@ -183,9 +186,9 @@ def test_stacking_classifier(self): self.assertEqual(clf.final_estimator_.n_features_in_, 10) # number of concatenated features self.assertEqual(len(clf.final_estimator_.feature_importances_), 10) classes = clf.named_estimators_['gbm1'].classes_ == clf.named_estimators_['gbm2'].classes_ - self.assertEqual(all(classes), True) + self.assertTrue(all(classes)) classes = clf.classes_ == clf.named_estimators_['gbm1'].classes_ - self.assertEqual(all(classes), True) + self.assertTrue(all(classes)) # sklearn <0.23 does not have a stacking regressor and n_features_in_ property @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') @@ -199,7 +202,10 @@ def test_stacking_regressor(self): reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor(n_estimators=3), passthrough=True) - reg.fit(X_train, y_train).score(X_test, y_test) + reg.fit(X_train, y_train) + score = reg.score(X_test, y_test) + self.assertGreaterEqual(score, 0.2) + self.assertLessEqual(score, 1.) self.assertEqual(reg.n_features_in_, 13) # number of input features self.assertEqual(len(reg.named_estimators_['gbm1'].feature_importances_), 13) self.assertEqual(reg.named_estimators_['gbm1'].n_features_in_,