From de83a69e65802bc64c280cdc55ec6503025b3a1f Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 20 Aug 2019 21:26:06 +1000 Subject: [PATCH] Change binning behavior to be same as PR #2342. --- src/io/bin.cpp | 14 +++++++---- tests/python_package_test/test_engine.py | 31 +++++++++++++++++++++--- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index b26a6a461e3e..40da30c6ad2d 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -186,7 +186,7 @@ namespace LightGBM { } } - // include zero bounds if possible + // include zero bounds and infinity bound if (max_bin == 2) { if (left_cnt == 0) { bin_upper_bound.push_back(kZeroThreshold); @@ -194,9 +194,14 @@ namespace LightGBM { bin_upper_bound.push_back(-kZeroThreshold); } } else if (max_bin >= 3) { - bin_upper_bound.push_back(-kZeroThreshold); - bin_upper_bound.push_back(kZeroThreshold); + if (left_cnt > 0) { + bin_upper_bound.push_back(-kZeroThreshold); + } + if (right_start >= 0) { + bin_upper_bound.push_back(kZeroThreshold); + } } + bin_upper_bound.push_back(std::numeric_limits::infinity()); // add forced bounds, excluding zeros since we have already added zero bounds int i = 0; @@ -207,7 +212,6 @@ namespace LightGBM { ++i; } } - bin_upper_bound.push_back(std::numeric_limits::infinity()); int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); if (num_to_insert > 0) { @@ -239,7 +243,7 @@ namespace LightGBM { } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); - CHECK(bin_upper_bound.size() <= max_bin); + CHECK(bin_upper_bound.size() <= static_cast(max_bin)); return bin_upper_bound; } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 2420ee9ec853..9f807d64b102 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -921,7 +921,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 99) + self.assertEqual(len(np.unique(est.predict(X))), 100) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) @@ -1599,7 +1599,7 @@ def test_forced_bins(self): forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', - 'max_bin': 6, + 'max_bin': 5, 'forcedbins_filename': forcedbins_filename, 'num_leaves': 2, 'min_data_in_leaf': 1, @@ -1613,7 +1613,7 @@ def test_forced_bins(self): predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) new_x[:, 0] = [0, 0, 0] - new_x[:, 1] = [-0.25, -0.5, -0.9] + new_x[:, 1] = [-0.9, -0.6, -0.3] predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 1) params['forcedbins_filename'] = '' @@ -1621,3 +1621,28 @@ def test_forced_bins(self): est = lgb.train(params, lgb_x, num_boost_round=100) predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) + + def test_binning_same_sign(self): + # test that binning works properly for features with only positive or only negative values + x = np.zeros((99, 2)) + x[:, 0] = np.arange(0.01, 1, 0.01) + x[:, 1] = -np.arange(0.01, 1, 0.01) + y = np.arange(0.01, 1, 0.01) + params = {'objective': 'regression_l1', + 'max_bin': 5, + 'num_leaves': 2, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'seed': 0} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + new_x = np.zeros((3, 2)) + new_x[:, 0] = [-1, 0, 1] + predicted = est.predict(new_x) + self.assertAlmostEqual(predicted[0], predicted[1]) + self.assertNotAlmostEqual(predicted[1], predicted[2]) + new_x = np.zeros((3, 2)) + new_x[:, 1] = [-1, 0, 1] + predicted = est.predict(new_x) + self.assertNotAlmostEqual(predicted[0], predicted[1]) + self.assertAlmostEqual(predicted[1], predicted[2])