From fe5c8e2547057c1fa5750bcddd359dd7708fab4b Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 31 Jul 2019 22:47:39 +1000 Subject: [PATCH 01/49] Fix bug where small values of max_bin cause crash. --- src/io/bin.cpp | 23 +++++++++++++++++------ tests/python_package_test/test_engine.py | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 617bdf5bac73..d77a73ef9336 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -177,11 +177,10 @@ namespace LightGBM { left_cnt = num_distinct_values; } - if (left_cnt > 0) { + if ((left_cnt > 0) && (max_bin > 1)) { int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); left_max_bin = std::max(1, left_max_bin); bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); - bin_upper_bound.back() = -kZeroThreshold; } int right_start = -1; @@ -192,16 +191,27 @@ namespace LightGBM { } } - if (right_start >= 0) { - int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); - CHECK(right_max_bin > 0); + if (bin_upper_bound.size() == 0) { + if (max_bin > 1) { + bin_upper_bound.push_back(kZeroThreshold); + } + } else { + bin_upper_bound.back() = -kZeroThreshold; + if (max_bin > 2) { + // create zero bin + bin_upper_bound.push_back(kZeroThreshold); + } + } + + int right_max_bin = max_bin - static_cast(bin_upper_bound.size()); + if ((right_start >= 0) && (right_max_bin > 0)) { auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); - bin_upper_bound.push_back(kZeroThreshold); bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); } else { bin_upper_bound.push_back(std::numeric_limits::infinity()); } + CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } @@ -280,6 +290,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); bin_upper_bound_.push_back(NaN); diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4c9a9eddc6c6..475cfebbb8c0 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -901,6 +901,25 @@ def test_max_bin_by_feature(self): est = lgb.train(params, lgb_data, num_boost_round=1) self.assertEqual(len(np.unique(est.predict(X))), 3) + def test_small_max_bin(self): + np.random.seed(0) + y = np.random.choice([0, 1], 100) + x = np.zeros((100, 1)) + x[:30, 0] = -1 + x[30:60, 0] = 1 + x[60:, 0] = 2 + params = {'objective': 'binary', + 'seed': 0, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'max_bin': 2} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=5) + x[0, 0] = np.nan + params['max_bin'] = 3 + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=5) + def test_refit(self): X, y = load_breast_cancer(True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) From 439bcfd0600ae795630b8303ce8e19bc1fc90378 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 31 Jul 2019 22:53:57 +1000 Subject: [PATCH 02/49] Revert "Fix bug where small values of max_bin cause crash." This reverts commit fe5c8e2547057c1fa5750bcddd359dd7708fab4b. --- src/io/bin.cpp | 23 ++++++----------------- tests/python_package_test/test_engine.py | 19 ------------------- 2 files changed, 6 insertions(+), 36 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index d77a73ef9336..617bdf5bac73 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -177,10 +177,11 @@ namespace LightGBM { left_cnt = num_distinct_values; } - if ((left_cnt > 0) && (max_bin > 1)) { + if (left_cnt > 0) { int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); left_max_bin = std::max(1, left_max_bin); bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); + bin_upper_bound.back() = -kZeroThreshold; } int right_start = -1; @@ -191,27 +192,16 @@ namespace LightGBM { } } - if (bin_upper_bound.size() == 0) { - if (max_bin > 1) { - bin_upper_bound.push_back(kZeroThreshold); - } - } else { - bin_upper_bound.back() = -kZeroThreshold; - if (max_bin > 2) { - // create zero bin - bin_upper_bound.push_back(kZeroThreshold); - } - } - - int right_max_bin = max_bin - static_cast(bin_upper_bound.size()); - if ((right_start >= 0) && (right_max_bin > 0)) { + if (right_start >= 0) { + int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); + CHECK(right_max_bin > 0); auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); + bin_upper_bound.push_back(kZeroThreshold); bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); } else { bin_upper_bound.push_back(std::numeric_limits::infinity()); } - CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } @@ -290,7 +280,6 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); - } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); bin_upper_bound_.push_back(NaN); diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 475cfebbb8c0..4c9a9eddc6c6 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -901,25 +901,6 @@ def test_max_bin_by_feature(self): est = lgb.train(params, lgb_data, num_boost_round=1) self.assertEqual(len(np.unique(est.predict(X))), 3) - def test_small_max_bin(self): - np.random.seed(0) - y = np.random.choice([0, 1], 100) - x = np.zeros((100, 1)) - x[:30, 0] = -1 - x[30:60, 0] = 1 - x[60:, 0] = 2 - params = {'objective': 'binary', - 'seed': 0, - 'min_data_in_leaf': 1, - 'verbose': -1, - 'max_bin': 2} - lgb_x = lgb.Dataset(x, label=y) - est = lgb.train(params, lgb_x, num_boost_round=5) - x[0, 0] = np.nan - params['max_bin'] = 3 - lgb_x = lgb.Dataset(x, label=y) - est = lgb.train(params, lgb_x, num_boost_round=5) - def test_refit(self): X, y = load_breast_cancer(True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) From 34e72c87c6d610b4fbf20e30a8bcf01989963a5e Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 03/49] Add functionality to force bin thresholds. --- docs/Parameters.rst | 8 +++ include/LightGBM/bin.h | 3 +- include/LightGBM/config.h | 5 ++ include/LightGBM/dataset.h | 3 + src/io/bin.cpp | 86 ++++++++++++++++++------ src/io/config_auto.cpp | 4 ++ src/io/dataset.cpp | 64 +++++++++++++++++- src/io/dataset_loader.cpp | 46 ++++++++++--- tests/data/forced_bins.json | 10 +++ tests/python_package_test/test_engine.py | 32 ++++++++- 10 files changed, 227 insertions(+), 34 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 93c241bce215..584237464fd1 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -404,6 +404,14 @@ Learning Control Parameters - see `this file `__ as an example +- ``forcedbins_filename`` :raw-html:`🔗︎`, default = ``""``, type = string + + - path to a ``.json`` file that specifies bin upper bounds for some or all features + + - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + + - see `this file `__ as an example + - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` - decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 46baee58fc46..1c5f62cd1907 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -146,9 +146,10 @@ class BinMapper { * \param bin_type Type of this bin * \param use_missing True to enable missing value handle * \param zero_as_missing True to use zero as missing value + * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm) */ void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, - bool use_missing, bool zero_as_missing); + bool use_missing, bool zero_as_missing, std::vector forced_upper_bounds); /*! * \brief Use specific number of bin to calculate the size of this class diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 08b2a7352c0a..1c0c14f69508 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -402,6 +402,11 @@ struct Config { // desc = see `this file `__ as an example std::string forcedsplits_filename = ""; + // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features + // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + // desc = see `this file `__ as an example + std::string forcedbins_filename = ""; + // check = >=0.0 // check = <=1.0 // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e688522fbb1a..900487eafbf4 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -596,6 +596,8 @@ class Dataset { void addFeaturesFrom(Dataset* other); + static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features); + private: std::string data_filename_; /*! \brief Store used features */ @@ -630,6 +632,7 @@ class Dataset { bool is_finish_load_; int max_bin_; std::vector max_bin_by_feature_; + std::vector> forced_bin_bounds_; int bin_construct_sample_cnt_; int min_data_in_bin_; bool use_missing_; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 617bdf5bac73..62713d1bddd3 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -150,8 +150,10 @@ namespace LightGBM { } std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { std::vector bin_upper_bound; + + // get list of distinct values int left_cnt_data = 0; int cnt_zero = 0; int right_cnt_data = 0; @@ -165,6 +167,7 @@ namespace LightGBM { } } + // get number of positive and negative distinct values int left_cnt = -1; for (int i = 0; i < num_distinct_values; ++i) { if (distinct_values[i] > -kZeroThreshold) { @@ -172,18 +175,9 @@ namespace LightGBM { break; } } - if (left_cnt < 0) { left_cnt = num_distinct_values; } - - if (left_cnt > 0) { - int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); - left_max_bin = std::max(1, left_max_bin); - bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); - bin_upper_bound.back() = -kZeroThreshold; - } - int right_start = -1; for (int i = left_cnt; i < num_distinct_values; ++i) { if (distinct_values[i] > kZeroThreshold) { @@ -192,21 +186,66 @@ namespace LightGBM { } } - if (right_start >= 0) { - int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); - CHECK(right_max_bin > 0); - auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, - num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); + // include zero bounds if possible + if (max_bin == 2) { + if (left_cnt == 0) { + bin_upper_bound.push_back(kZeroThreshold); + } else { + bin_upper_bound.push_back(-kZeroThreshold); + } + } else if (max_bin >= 3) { + bin_upper_bound.push_back(-kZeroThreshold); bin_upper_bound.push_back(kZeroThreshold); - bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); - } else { - bin_upper_bound.push_back(std::numeric_limits::infinity()); } + + // add forced bounds, excluding zeros since we have already added zero bounds + int i = 0; + while (i < forced_upper_bounds.size()) { + if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { + forced_upper_bounds.erase(forced_upper_bounds.begin() + i); + } else { + ++i; + } + } + bin_upper_bound.push_back(std::numeric_limits::infinity()); + int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); + int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); + if (num_to_insert > 0) { + bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); + } + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + + // find remaining bounds + std::vector bounds_to_add; + int value_ind = 0; + for (int i = 0; i < bin_upper_bound.size(); ++i) { + int cnt_in_bin = 0; + int distinct_cnt_in_bin = 0; + int bin_start = value_ind; + while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) { + cnt_in_bin += counts[value_ind]; + ++distinct_cnt_in_bin; + ++value_ind; + } + int bins_remaining = max_bin - static_cast(bin_upper_bound.size()) - static_cast(bounds_to_add.size()); + int num_sub_bins = static_cast(std::lround((static_cast(cnt_in_bin) * bins_remaining / total_sample_cnt))); + num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1; + if (i == bin_upper_bound.size() - 1) { + num_sub_bins = bins_remaining + 1; + } + std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, + num_sub_bins, cnt_in_bin, min_data_in_bin); + bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity + } + bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, - int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) { + int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, + std::vector forced_upper_bounds) { int na_cnt = 0; int tmp_num_sample_values = 0; for (int i = 0; i < num_sample_values; ++i) { @@ -274,14 +313,17 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); if (bin_upper_bound_.size() == 2) { missing_type_ = MissingType::None; } } else if (missing_type_ == MissingType::None) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); } else { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin, forced_upper_bounds); bin_upper_bound_.push_back(NaN); } num_bin_ = static_cast(bin_upper_bound_.size()); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 8d75b1cde3df..ad5b43811ebe 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -211,6 +211,7 @@ std::unordered_set Config::parameter_set({ "monotone_constraints", "feature_contri", "forcedsplits_filename", + "forcedbins_filename", "refit_decay_rate", "cegb_tradeoff", "cegb_penalty_split", @@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map=0.0); CHECK(refit_decay_rate <=1.0); @@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast(monotone_constraints), ",") << "]\n"; str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; + str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n"; str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index f201a40a1a7a..c931e945cd24 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -8,12 +8,17 @@ #include #include #include +#include #include #include #include #include #include +#include + +using namespace json11; + namespace LightGBM { @@ -324,6 +329,7 @@ void Dataset::Construct( max_bin_by_feature_.resize(num_total_features_); max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end()); } + forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; @@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) { if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } + if (param.count("forcedbins_filename")) { + std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); + if (config_bounds != forced_bin_bounds_) { + Log::Warning("Cannot change forced bins after constructed Dataset handle."); + } + } if (!io_config.monotone_constraints.empty()) { CHECK(static_cast(num_total_features_) == io_config.monotone_constraints.size()); @@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { for (int i = 0; i < num_total_features_; ++i) { size_of_header += feature_names_[i].size() + sizeof(int); } + // size of forced bins + for (int i = 0; i < num_total_features_; ++i) { + size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int); + } writer->Write(&size_of_header, sizeof(size_of_header)); // write header writer->Write(&num_data_, sizeof(num_data_)); @@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { const char* c_str = feature_names_[i].c_str(); writer->Write(c_str, sizeof(char) * str_len); } + // write forced bins + for (int i = 0; i < num_total_features_; ++i) { + int num_bounds = static_cast(forced_bin_bounds_[i].size()); + writer->Write(&num_bounds, sizeof(int)); + + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); + } + } // get size of meta data size_t size_of_metadata = metadata_.SizesInByte(); @@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) { for (auto n : feature_names_) { fprintf(file, "%s, ", n.c_str()); } + fprintf(file, "\nforced_bins: "); + for (int i = 0; i < num_total_features_; ++i) { + fprintf(file, "\nfeature %d: ", i); + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + fprintf(file, "%lf, ", forced_bin_bounds_[i][j]); + } + } std::vector> iterators; iterators.reserve(num_features_); for (int j = 0; j < num_features_; ++j) { @@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushVector(feature_names_, other->feature_names_); PushVector(feature2subfeature_, other->feature2subfeature_); PushVector(group_feature_cnt_, other->group_feature_cnt_); + PushVector(forced_bin_bounds_, other->forced_bin_bounds_); feature_groups_.reserve(other->feature_groups_.size()); for (auto& fg : other->feature_groups_) { feature_groups_.emplace_back(new FeatureGroup(*fg)); @@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0); PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0); - + PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1); num_features_ += other->num_features_; num_total_features_ += other->num_total_features_; num_groups_ += other->num_groups_; } + +std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) { + std::vector> forced_bins(num_total_features, std::vector()); + if (forced_bins_path != "") { + std::ifstream forced_bins_stream(forced_bins_path.c_str()); + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } + } + return forced_bins; +} + + } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 1130d803ea36..f36d5b1df27d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include @@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b } dataset->feature_names_.emplace_back(str_buf.str()); } + // get forced_bin_bounds_ + dataset->forced_bin_bounds_ = std::vector>(dataset->num_total_features_, std::vector()); + for (int i = 0; i < dataset->num_total_features_; ++i) { + int num_bounds = *(reinterpret_cast(mem_ptr)); + mem_ptr += sizeof(int); + dataset->forced_bin_bounds_[i] = std::vector(); + const double* tmp_ptr_forced_bounds = reinterpret_cast(mem_ptr); + + for (int j = 0; j < num_bounds; ++j) { + double bound = tmp_ptr_forced_bounds[j]; + dataset->forced_bin_bounds_[i].push_back(bound); + } + mem_ptr += num_bounds * sizeof(double); + + } // read size of meta data read_cnt = reader->Read(buffer.data(), sizeof(size_t)); @@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b return dataset.release(); } + Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, int** sample_indices, int num_col, const int* num_per_col, size_t total_sample_size, data_size_t num_data) { @@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, CHECK(static_cast(num_col) == config_.max_bin_by_feature.size()); CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col); + const data_size_t filter_cnt = static_cast( static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); if (Network::num_machines() == 1) { @@ -585,12 +605,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin, config_.min_data_in_bin, filter_cnt, - bin_type, config_.use_missing, config_.zero_as_missing); + bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -630,12 +651,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin_by_feature[start[rank] + i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -872,6 +894,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_); + // check the range of label_idx, weight_idx and group_idx CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_); @@ -909,12 +935,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -955,13 +982,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, - config_.use_missing, config_.zero_as_missing); + config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4c9a9eddc6c6..59ea0113f50a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -895,7 +895,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 100) + self.assertEqual(len(np.unique(est.predict(X))), 99) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) @@ -1544,3 +1544,33 @@ def constant_metric(preds, train_data): decreasing_metric(preds, train_data)], early_stopping_rounds=5, verbose_eval=False) self.assertEqual(gbm.best_iteration, 1) + + def test_forced_bins(self): + x = np.zeros((100, 2)) + x[:, 0] = np.arange(0, 1, 0.01) + x[:, 1] = -np.arange(0, 1, 0.01) + y = np.arange(0, 1, 0.01) + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + params = {'objective': 'regression_l1', + 'max_bin': 6, + 'forcedbins_filename': forcedbins_filename, + 'num_leaves': 2, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'seed': 0} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + new_x = np.zeros((3, x.shape[1])) + new_x[:, 0] = [0.31, 0.37, 0.41] + new_x[:, 1] = [0, 0, 0] + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 3) + new_x[:, 0] = [0, 0, 0] + new_x[:, 1] = [-0.25, -0.5, -0.9] + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 1) + params['forcedbins_filename'] = '' + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 3) From 5b21573ecb4dd9e463e47783b9a8309f000c6bf2 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 04/49] Fix style issues. --- docs/Parameters.rst | 4 +- .../regression}/forced_bins.json | 2 +- examples/regression/train.conf | 3 ++ include/LightGBM/config.h | 4 +- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 40 ++++++++++--------- src/io/dataset_loader.cpp | 1 + tests/python_package_test/test_engine.py | 3 +- 8 files changed, 34 insertions(+), 25 deletions(-) rename {tests/data => examples/regression}/forced_bins.json (98%) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 584237464fd1..83a04b992393 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,9 +408,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json similarity index 98% rename from tests/data/forced_bins.json rename to examples/regression/forced_bins.json index aa74c36ffb78..1ee0a49d727c 100644 --- a/tests/data/forced_bins.json +++ b/examples/regression/forced_bins.json @@ -7,4 +7,4 @@ "feature": 1, "bin_upper_bound": [ -0.1, -0.15, -0.2 ] } -] \ No newline at end of file +] diff --git a/examples/regression/train.conf b/examples/regression/train.conf index 11396c23ecc2..4c73169dc8f9 100644 --- a/examples/regression/train.conf +++ b/examples/regression/train.conf @@ -29,6 +29,9 @@ is_training_metric = true # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. max_bin = 255 +# forced bin thresholds +# forcedbins_filename = forced_bins.json + # training data # if exsting weight file, should name to "regression.train.weight" # alias: train_data, train diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 1c0c14f69508..89fa57453c88 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,8 +403,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 62713d1bddd3..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c931e945cd24..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -5,10 +5,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -1071,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f36d5b1df27d..eb83d74bfe3d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 59ea0113f50a..d55bac7711a1 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1550,7 +1550,8 @@ def test_forced_bins(self): x[:, 0] = np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01) - forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', 'max_bin': 6, 'forcedbins_filename': forcedbins_filename, From 2be599af63dbe4d750c12cdad1737fae4628c64d Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:19:58 +1000 Subject: [PATCH 05/49] Use stable sort. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2556a59b4715..b26a6a461e3e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } From 6a098f0f432db9371fb445357f4a92543490a5cb Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 06/49] Minor style and doc fixes. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/dataset_loader.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 83a04b992393..d6f8a2a8e118 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,7 +408,7 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - see `this file `__ as an example diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 89fa57453c88..0a621f0036d0 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,7 +403,7 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) // desc = see `this file `__ as an example std::string forcedbins_filename = ""; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index eb83d74bfe3d..c00b9b7fdae5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 8f736369106564377bd02e496b31e16a5e894797 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 07/49] Add functionality to force bin thresholds. --- docs/Parameters.rst | 8 ++ include/LightGBM/bin.h | 3 +- include/LightGBM/config.h | 5 ++ include/LightGBM/dataset.h | 3 + src/io/bin.cpp | 93 +++++++++++++++--------- src/io/config_auto.cpp | 4 + src/io/dataset.cpp | 64 +++++++++++++++- src/io/dataset_loader.cpp | 46 +++++++++--- tests/data/forced_bins.json | 10 +++ tests/python_package_test/test_engine.py | 30 ++++++++ 10 files changed, 222 insertions(+), 44 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 8c16e190d223..10105bfbed5a 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -404,6 +404,14 @@ Learning Control Parameters - see `this file `__ as an example +- ``forcedbins_filename`` :raw-html:`🔗︎`, default = ``""``, type = string + + - path to a ``.json`` file that specifies bin upper bounds for some or all features + + - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + + - see `this file `__ as an example + - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` - decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 46baee58fc46..1c5f62cd1907 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -146,9 +146,10 @@ class BinMapper { * \param bin_type Type of this bin * \param use_missing True to enable missing value handle * \param zero_as_missing True to use zero as missing value + * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm) */ void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, - bool use_missing, bool zero_as_missing); + bool use_missing, bool zero_as_missing, std::vector forced_upper_bounds); /*! * \brief Use specific number of bin to calculate the size of this class diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 190e239cf5a7..d2a953ddb416 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -402,6 +402,11 @@ struct Config { // desc = see `this file `__ as an example std::string forcedsplits_filename = ""; + // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features + // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + // desc = see `this file `__ as an example + std::string forcedbins_filename = ""; + // check = >=0.0 // check = <=1.0 // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e688522fbb1a..900487eafbf4 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -596,6 +596,8 @@ class Dataset { void addFeaturesFrom(Dataset* other); + static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features); + private: std::string data_filename_; /*! \brief Store used features */ @@ -630,6 +632,7 @@ class Dataset { bool is_finish_load_; int max_bin_; std::vector max_bin_by_feature_; + std::vector> forced_bin_bounds_; int bin_construct_sample_cnt_; int min_data_in_bin_; bool use_missing_; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 9b105e282923..62713d1bddd3 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -150,8 +150,10 @@ namespace LightGBM { } std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { std::vector bin_upper_bound; + + // get list of distinct values int left_cnt_data = 0; int cnt_zero = 0; int right_cnt_data = 0; @@ -165,6 +167,7 @@ namespace LightGBM { } } + // get number of positive and negative distinct values int left_cnt = -1; for (int i = 0; i < num_distinct_values; ++i) { if (distinct_values[i] > -kZeroThreshold) { @@ -172,17 +175,9 @@ namespace LightGBM { break; } } - if (left_cnt < 0) { left_cnt = num_distinct_values; } - - if ((left_cnt > 0) && (max_bin > 1)) { - int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); - left_max_bin = std::max(1, left_max_bin); - bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); - } - int right_start = -1; for (int i = left_cnt; i < num_distinct_values; ++i) { if (distinct_values[i] > kZeroThreshold) { @@ -191,37 +186,66 @@ namespace LightGBM { } } - if (bin_upper_bound.size() == 0) { - if (max_bin > 2) { - // create zero bin - bin_upper_bound.push_back(-kZeroThreshold); - bin_upper_bound.push_back(kZeroThreshold); - } - else if (max_bin > 1) { + // include zero bounds if possible + if (max_bin == 2) { + if (left_cnt == 0) { bin_upper_bound.push_back(kZeroThreshold); + } else { + bin_upper_bound.push_back(-kZeroThreshold); } - } else { - bin_upper_bound.back() = -kZeroThreshold; - if (max_bin > 2) { - // create zero bin - bin_upper_bound.push_back(kZeroThreshold); + } else if (max_bin >= 3) { + bin_upper_bound.push_back(-kZeroThreshold); + bin_upper_bound.push_back(kZeroThreshold); + } + + // add forced bounds, excluding zeros since we have already added zero bounds + int i = 0; + while (i < forced_upper_bounds.size()) { + if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { + forced_upper_bounds.erase(forced_upper_bounds.begin() + i); + } else { + ++i; } } - - int right_max_bin = max_bin - static_cast(bin_upper_bound.size()); - if ((right_start >= 0) && (right_max_bin > 0)) { - auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, - num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); - bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); - } else { - bin_upper_bound.push_back(std::numeric_limits::infinity()); + bin_upper_bound.push_back(std::numeric_limits::infinity()); + int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); + int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); + if (num_to_insert > 0) { + bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); + } + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + + // find remaining bounds + std::vector bounds_to_add; + int value_ind = 0; + for (int i = 0; i < bin_upper_bound.size(); ++i) { + int cnt_in_bin = 0; + int distinct_cnt_in_bin = 0; + int bin_start = value_ind; + while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) { + cnt_in_bin += counts[value_ind]; + ++distinct_cnt_in_bin; + ++value_ind; + } + int bins_remaining = max_bin - static_cast(bin_upper_bound.size()) - static_cast(bounds_to_add.size()); + int num_sub_bins = static_cast(std::lround((static_cast(cnt_in_bin) * bins_remaining / total_sample_cnt))); + num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1; + if (i == bin_upper_bound.size() - 1) { + num_sub_bins = bins_remaining + 1; + } + std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, + num_sub_bins, cnt_in_bin, min_data_in_bin); + bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } + bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, - int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) { + int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, + std::vector forced_upper_bounds) { int na_cnt = 0; int tmp_num_sample_values = 0; for (int i = 0; i < num_sample_values; ++i) { @@ -289,14 +313,17 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); if (bin_upper_bound_.size() == 2) { missing_type_ = MissingType::None; } } else if (missing_type_ == MissingType::None) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); } else { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin, forced_upper_bounds); bin_upper_bound_.push_back(NaN); } num_bin_ = static_cast(bin_upper_bound_.size()); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 8d75b1cde3df..ad5b43811ebe 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -211,6 +211,7 @@ std::unordered_set Config::parameter_set({ "monotone_constraints", "feature_contri", "forcedsplits_filename", + "forcedbins_filename", "refit_decay_rate", "cegb_tradeoff", "cegb_penalty_split", @@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map=0.0); CHECK(refit_decay_rate <=1.0); @@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast(monotone_constraints), ",") << "]\n"; str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; + str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n"; str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index f201a40a1a7a..c931e945cd24 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -8,12 +8,17 @@ #include #include #include +#include #include #include #include #include #include +#include + +using namespace json11; + namespace LightGBM { @@ -324,6 +329,7 @@ void Dataset::Construct( max_bin_by_feature_.resize(num_total_features_); max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end()); } + forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; @@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) { if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } + if (param.count("forcedbins_filename")) { + std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); + if (config_bounds != forced_bin_bounds_) { + Log::Warning("Cannot change forced bins after constructed Dataset handle."); + } + } if (!io_config.monotone_constraints.empty()) { CHECK(static_cast(num_total_features_) == io_config.monotone_constraints.size()); @@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { for (int i = 0; i < num_total_features_; ++i) { size_of_header += feature_names_[i].size() + sizeof(int); } + // size of forced bins + for (int i = 0; i < num_total_features_; ++i) { + size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int); + } writer->Write(&size_of_header, sizeof(size_of_header)); // write header writer->Write(&num_data_, sizeof(num_data_)); @@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { const char* c_str = feature_names_[i].c_str(); writer->Write(c_str, sizeof(char) * str_len); } + // write forced bins + for (int i = 0; i < num_total_features_; ++i) { + int num_bounds = static_cast(forced_bin_bounds_[i].size()); + writer->Write(&num_bounds, sizeof(int)); + + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); + } + } // get size of meta data size_t size_of_metadata = metadata_.SizesInByte(); @@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) { for (auto n : feature_names_) { fprintf(file, "%s, ", n.c_str()); } + fprintf(file, "\nforced_bins: "); + for (int i = 0; i < num_total_features_; ++i) { + fprintf(file, "\nfeature %d: ", i); + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + fprintf(file, "%lf, ", forced_bin_bounds_[i][j]); + } + } std::vector> iterators; iterators.reserve(num_features_); for (int j = 0; j < num_features_; ++j) { @@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushVector(feature_names_, other->feature_names_); PushVector(feature2subfeature_, other->feature2subfeature_); PushVector(group_feature_cnt_, other->group_feature_cnt_); + PushVector(forced_bin_bounds_, other->forced_bin_bounds_); feature_groups_.reserve(other->feature_groups_.size()); for (auto& fg : other->feature_groups_) { feature_groups_.emplace_back(new FeatureGroup(*fg)); @@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0); PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0); - + PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1); num_features_ += other->num_features_; num_total_features_ += other->num_total_features_; num_groups_ += other->num_groups_; } + +std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) { + std::vector> forced_bins(num_total_features, std::vector()); + if (forced_bins_path != "") { + std::ifstream forced_bins_stream(forced_bins_path.c_str()); + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } + } + return forced_bins; +} + + } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 1130d803ea36..f36d5b1df27d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include @@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b } dataset->feature_names_.emplace_back(str_buf.str()); } + // get forced_bin_bounds_ + dataset->forced_bin_bounds_ = std::vector>(dataset->num_total_features_, std::vector()); + for (int i = 0; i < dataset->num_total_features_; ++i) { + int num_bounds = *(reinterpret_cast(mem_ptr)); + mem_ptr += sizeof(int); + dataset->forced_bin_bounds_[i] = std::vector(); + const double* tmp_ptr_forced_bounds = reinterpret_cast(mem_ptr); + + for (int j = 0; j < num_bounds; ++j) { + double bound = tmp_ptr_forced_bounds[j]; + dataset->forced_bin_bounds_[i].push_back(bound); + } + mem_ptr += num_bounds * sizeof(double); + + } // read size of meta data read_cnt = reader->Read(buffer.data(), sizeof(size_t)); @@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b return dataset.release(); } + Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, int** sample_indices, int num_col, const int* num_per_col, size_t total_sample_size, data_size_t num_data) { @@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, CHECK(static_cast(num_col) == config_.max_bin_by_feature.size()); CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col); + const data_size_t filter_cnt = static_cast( static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); if (Network::num_machines() == 1) { @@ -585,12 +605,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin, config_.min_data_in_bin, filter_cnt, - bin_type, config_.use_missing, config_.zero_as_missing); + bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -630,12 +651,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin_by_feature[start[rank] + i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -872,6 +894,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_); + // check the range of label_idx, weight_idx and group_idx CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_); @@ -909,12 +935,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -955,13 +982,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, - config_.use_missing, config_.zero_as_missing); + config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 2039742dc9ff..4eb1e2cb8e38 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1590,3 +1590,33 @@ def constant_metric(preds, train_data): decreasing_metric(preds, train_data)], early_stopping_rounds=5, verbose_eval=False) self.assertEqual(gbm.best_iteration, 1) + + def test_forced_bins(self): + x = np.zeros((100, 2)) + x[:, 0] = np.arange(0, 1, 0.01) + x[:, 1] = -np.arange(0, 1, 0.01) + y = np.arange(0, 1, 0.01) + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + params = {'objective': 'regression_l1', + 'max_bin': 6, + 'forcedbins_filename': forcedbins_filename, + 'num_leaves': 2, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'seed': 0} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + new_x = np.zeros((3, x.shape[1])) + new_x[:, 0] = [0.31, 0.37, 0.41] + new_x[:, 1] = [0, 0, 0] + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 3) + new_x[:, 0] = [0, 0, 0] + new_x[:, 1] = [-0.25, -0.5, -0.9] + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 1) + params['forcedbins_filename'] = '' + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 3) From 6c2d048c79075be9f124d46553715dbeae06471d Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 08/49] Fix style issues. --- docs/Parameters.rst | 4 +- .../regression}/forced_bins.json | 2 +- examples/regression/train.conf | 3 ++ include/LightGBM/config.h | 4 +- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 40 ++++++++++--------- src/io/dataset_loader.cpp | 1 + tests/python_package_test/test_engine.py | 3 +- 8 files changed, 34 insertions(+), 25 deletions(-) rename {tests/data => examples/regression}/forced_bins.json (98%) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 10105bfbed5a..c4f45f0010c4 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,9 +408,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json similarity index 98% rename from tests/data/forced_bins.json rename to examples/regression/forced_bins.json index aa74c36ffb78..1ee0a49d727c 100644 --- a/tests/data/forced_bins.json +++ b/examples/regression/forced_bins.json @@ -7,4 +7,4 @@ "feature": 1, "bin_upper_bound": [ -0.1, -0.15, -0.2 ] } -] \ No newline at end of file +] diff --git a/examples/regression/train.conf b/examples/regression/train.conf index 11396c23ecc2..4c73169dc8f9 100644 --- a/examples/regression/train.conf +++ b/examples/regression/train.conf @@ -29,6 +29,9 @@ is_training_metric = true # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. max_bin = 255 +# forced bin thresholds +# forcedbins_filename = forced_bins.json + # training data # if exsting weight file, should name to "regression.train.weight" # alias: train_data, train diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d2a953ddb416..56903a9b96ae 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,8 +403,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 62713d1bddd3..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c931e945cd24..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -5,10 +5,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -1071,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f36d5b1df27d..eb83d74bfe3d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4eb1e2cb8e38..2420ee9ec853 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1596,7 +1596,8 @@ def test_forced_bins(self): x[:, 0] = np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01) - forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', 'max_bin': 6, 'forcedbins_filename': forcedbins_filename, From feb861f3f326787eab3e92e2945a6a8d9fdbd16b Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:19:58 +1000 Subject: [PATCH 09/49] Use stable sort. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2556a59b4715..b26a6a461e3e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } From 873fa64a2ba3ace8ad94271a4bbd37c0c3a6add5 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 10/49] Minor style and doc fixes. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/dataset_loader.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index c4f45f0010c4..28777637d100 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,7 +408,7 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - see `this file `__ as an example diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 56903a9b96ae..b67ee9656468 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,7 +403,7 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) // desc = see `this file `__ as an example std::string forcedbins_filename = ""; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index eb83d74bfe3d..c00b9b7fdae5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 4cd89e48bba33bf0ae3978bb530a214963d0d59b Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 20 Aug 2019 21:26:06 +1000 Subject: [PATCH 11/49] Change binning behavior to be same as PR #2342. --- src/io/bin.cpp | 14 +++++++---- tests/python_package_test/test_engine.py | 31 +++++++++++++++++++++--- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index b26a6a461e3e..40da30c6ad2d 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -186,7 +186,7 @@ namespace LightGBM { } } - // include zero bounds if possible + // include zero bounds and infinity bound if (max_bin == 2) { if (left_cnt == 0) { bin_upper_bound.push_back(kZeroThreshold); @@ -194,9 +194,14 @@ namespace LightGBM { bin_upper_bound.push_back(-kZeroThreshold); } } else if (max_bin >= 3) { - bin_upper_bound.push_back(-kZeroThreshold); - bin_upper_bound.push_back(kZeroThreshold); + if (left_cnt > 0) { + bin_upper_bound.push_back(-kZeroThreshold); + } + if (right_start >= 0) { + bin_upper_bound.push_back(kZeroThreshold); + } } + bin_upper_bound.push_back(std::numeric_limits::infinity()); // add forced bounds, excluding zeros since we have already added zero bounds int i = 0; @@ -207,7 +212,6 @@ namespace LightGBM { ++i; } } - bin_upper_bound.push_back(std::numeric_limits::infinity()); int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); if (num_to_insert > 0) { @@ -239,7 +243,7 @@ namespace LightGBM { } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); - CHECK(bin_upper_bound.size() <= max_bin); + CHECK(bin_upper_bound.size() <= static_cast(max_bin)); return bin_upper_bound; } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 2420ee9ec853..9f807d64b102 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -921,7 +921,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 99) + self.assertEqual(len(np.unique(est.predict(X))), 100) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) @@ -1599,7 +1599,7 @@ def test_forced_bins(self): forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', - 'max_bin': 6, + 'max_bin': 5, 'forcedbins_filename': forcedbins_filename, 'num_leaves': 2, 'min_data_in_leaf': 1, @@ -1613,7 +1613,7 @@ def test_forced_bins(self): predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) new_x[:, 0] = [0, 0, 0] - new_x[:, 1] = [-0.25, -0.5, -0.9] + new_x[:, 1] = [-0.9, -0.6, -0.3] predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 1) params['forcedbins_filename'] = '' @@ -1621,3 +1621,28 @@ def test_forced_bins(self): est = lgb.train(params, lgb_x, num_boost_round=100) predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) + + def test_binning_same_sign(self): + # test that binning works properly for features with only positive or only negative values + x = np.zeros((99, 2)) + x[:, 0] = np.arange(0.01, 1, 0.01) + x[:, 1] = -np.arange(0.01, 1, 0.01) + y = np.arange(0.01, 1, 0.01) + params = {'objective': 'regression_l1', + 'max_bin': 5, + 'num_leaves': 2, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'seed': 0} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + new_x = np.zeros((3, 2)) + new_x[:, 0] = [-1, 0, 1] + predicted = est.predict(new_x) + self.assertAlmostEqual(predicted[0], predicted[1]) + self.assertNotAlmostEqual(predicted[1], predicted[2]) + new_x = np.zeros((3, 2)) + new_x[:, 1] = [-1, 0, 1] + predicted = est.predict(new_x) + self.assertNotAlmostEqual(predicted[0], predicted[1]) + self.assertAlmostEqual(predicted[1], predicted[2]) From 9d22071dccedea825c862b02a65a1bef3fb8ce23 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 12/49] Add functionality to force bin thresholds. --- docs/Parameters.rst | 8 +++ include/LightGBM/bin.h | 3 +- include/LightGBM/config.h | 5 ++ include/LightGBM/dataset.h | 3 + src/io/bin.cpp | 88 +++++++++++++++++------- src/io/config_auto.cpp | 4 ++ src/io/dataset.cpp | 64 ++++++++++++++++- src/io/dataset_loader.cpp | 46 ++++++++++--- tests/data/forced_bins.json | 10 +++ tests/python_package_test/test_engine.py | 32 ++++++++- 10 files changed, 227 insertions(+), 36 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 8c16e190d223..10105bfbed5a 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -404,6 +404,14 @@ Learning Control Parameters - see `this file `__ as an example +- ``forcedbins_filename`` :raw-html:`🔗︎`, default = ``""``, type = string + + - path to a ``.json`` file that specifies bin upper bounds for some or all features + + - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + + - see `this file `__ as an example + - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` - decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 46baee58fc46..1c5f62cd1907 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -146,9 +146,10 @@ class BinMapper { * \param bin_type Type of this bin * \param use_missing True to enable missing value handle * \param zero_as_missing True to use zero as missing value + * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm) */ void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, - bool use_missing, bool zero_as_missing); + bool use_missing, bool zero_as_missing, std::vector forced_upper_bounds); /*! * \brief Use specific number of bin to calculate the size of this class diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 190e239cf5a7..d2a953ddb416 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -402,6 +402,11 @@ struct Config { // desc = see `this file `__ as an example std::string forcedsplits_filename = ""; + // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features + // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + // desc = see `this file `__ as an example + std::string forcedbins_filename = ""; + // check = >=0.0 // check = <=1.0 // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e688522fbb1a..900487eafbf4 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -596,6 +596,8 @@ class Dataset { void addFeaturesFrom(Dataset* other); + static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features); + private: std::string data_filename_; /*! \brief Store used features */ @@ -630,6 +632,7 @@ class Dataset { bool is_finish_load_; int max_bin_; std::vector max_bin_by_feature_; + std::vector> forced_bin_bounds_; int bin_construct_sample_cnt_; int min_data_in_bin_; bool use_missing_; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2e79a80266b6..62713d1bddd3 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -150,8 +150,10 @@ namespace LightGBM { } std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { std::vector bin_upper_bound; + + // get list of distinct values int left_cnt_data = 0; int cnt_zero = 0; int right_cnt_data = 0; @@ -165,6 +167,7 @@ namespace LightGBM { } } + // get number of positive and negative distinct values int left_cnt = -1; for (int i = 0; i < num_distinct_values; ++i) { if (distinct_values[i] > -kZeroThreshold) { @@ -172,20 +175,9 @@ namespace LightGBM { break; } } - if (left_cnt < 0) { left_cnt = num_distinct_values; } - - if ((left_cnt > 0) && (max_bin > 1)) { - int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); - left_max_bin = std::max(1, left_max_bin); - bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); - if (bin_upper_bound.size() > 0) { - bin_upper_bound.back() = -kZeroThreshold; - } - } - int right_start = -1; for (int i = left_cnt; i < num_distinct_values; ++i) { if (distinct_values[i] > kZeroThreshold) { @@ -194,21 +186,66 @@ namespace LightGBM { } } - int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); - if (right_start >= 0 && right_max_bin > 0) { - auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, - num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); + // include zero bounds if possible + if (max_bin == 2) { + if (left_cnt == 0) { + bin_upper_bound.push_back(kZeroThreshold); + } else { + bin_upper_bound.push_back(-kZeroThreshold); + } + } else if (max_bin >= 3) { + bin_upper_bound.push_back(-kZeroThreshold); bin_upper_bound.push_back(kZeroThreshold); - bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); - } else { - bin_upper_bound.push_back(std::numeric_limits::infinity()); } - CHECK(bin_upper_bound.size() <= static_cast(max_bin)); + + // add forced bounds, excluding zeros since we have already added zero bounds + int i = 0; + while (i < forced_upper_bounds.size()) { + if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { + forced_upper_bounds.erase(forced_upper_bounds.begin() + i); + } else { + ++i; + } + } + bin_upper_bound.push_back(std::numeric_limits::infinity()); + int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); + int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); + if (num_to_insert > 0) { + bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); + } + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + + // find remaining bounds + std::vector bounds_to_add; + int value_ind = 0; + for (int i = 0; i < bin_upper_bound.size(); ++i) { + int cnt_in_bin = 0; + int distinct_cnt_in_bin = 0; + int bin_start = value_ind; + while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) { + cnt_in_bin += counts[value_ind]; + ++distinct_cnt_in_bin; + ++value_ind; + } + int bins_remaining = max_bin - static_cast(bin_upper_bound.size()) - static_cast(bounds_to_add.size()); + int num_sub_bins = static_cast(std::lround((static_cast(cnt_in_bin) * bins_remaining / total_sample_cnt))); + num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1; + if (i == bin_upper_bound.size() - 1) { + num_sub_bins = bins_remaining + 1; + } + std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, + num_sub_bins, cnt_in_bin, min_data_in_bin); + bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity + } + bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, - int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) { + int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, + std::vector forced_upper_bounds) { int na_cnt = 0; int tmp_num_sample_values = 0; for (int i = 0; i < num_sample_values; ++i) { @@ -276,14 +313,17 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); if (bin_upper_bound_.size() == 2) { missing_type_ = MissingType::None; } } else if (missing_type_ == MissingType::None) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); } else { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin, forced_upper_bounds); bin_upper_bound_.push_back(NaN); } num_bin_ = static_cast(bin_upper_bound_.size()); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 8d75b1cde3df..ad5b43811ebe 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -211,6 +211,7 @@ std::unordered_set Config::parameter_set({ "monotone_constraints", "feature_contri", "forcedsplits_filename", + "forcedbins_filename", "refit_decay_rate", "cegb_tradeoff", "cegb_penalty_split", @@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map=0.0); CHECK(refit_decay_rate <=1.0); @@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast(monotone_constraints), ",") << "]\n"; str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; + str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n"; str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index f201a40a1a7a..c931e945cd24 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -8,12 +8,17 @@ #include #include #include +#include #include #include #include #include #include +#include + +using namespace json11; + namespace LightGBM { @@ -324,6 +329,7 @@ void Dataset::Construct( max_bin_by_feature_.resize(num_total_features_); max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end()); } + forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; @@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) { if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } + if (param.count("forcedbins_filename")) { + std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); + if (config_bounds != forced_bin_bounds_) { + Log::Warning("Cannot change forced bins after constructed Dataset handle."); + } + } if (!io_config.monotone_constraints.empty()) { CHECK(static_cast(num_total_features_) == io_config.monotone_constraints.size()); @@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { for (int i = 0; i < num_total_features_; ++i) { size_of_header += feature_names_[i].size() + sizeof(int); } + // size of forced bins + for (int i = 0; i < num_total_features_; ++i) { + size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int); + } writer->Write(&size_of_header, sizeof(size_of_header)); // write header writer->Write(&num_data_, sizeof(num_data_)); @@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { const char* c_str = feature_names_[i].c_str(); writer->Write(c_str, sizeof(char) * str_len); } + // write forced bins + for (int i = 0; i < num_total_features_; ++i) { + int num_bounds = static_cast(forced_bin_bounds_[i].size()); + writer->Write(&num_bounds, sizeof(int)); + + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); + } + } // get size of meta data size_t size_of_metadata = metadata_.SizesInByte(); @@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) { for (auto n : feature_names_) { fprintf(file, "%s, ", n.c_str()); } + fprintf(file, "\nforced_bins: "); + for (int i = 0; i < num_total_features_; ++i) { + fprintf(file, "\nfeature %d: ", i); + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + fprintf(file, "%lf, ", forced_bin_bounds_[i][j]); + } + } std::vector> iterators; iterators.reserve(num_features_); for (int j = 0; j < num_features_; ++j) { @@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushVector(feature_names_, other->feature_names_); PushVector(feature2subfeature_, other->feature2subfeature_); PushVector(group_feature_cnt_, other->group_feature_cnt_); + PushVector(forced_bin_bounds_, other->forced_bin_bounds_); feature_groups_.reserve(other->feature_groups_.size()); for (auto& fg : other->feature_groups_) { feature_groups_.emplace_back(new FeatureGroup(*fg)); @@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0); PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0); - + PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1); num_features_ += other->num_features_; num_total_features_ += other->num_total_features_; num_groups_ += other->num_groups_; } + +std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) { + std::vector> forced_bins(num_total_features, std::vector()); + if (forced_bins_path != "") { + std::ifstream forced_bins_stream(forced_bins_path.c_str()); + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } + } + return forced_bins; +} + + } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 1130d803ea36..f36d5b1df27d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include @@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b } dataset->feature_names_.emplace_back(str_buf.str()); } + // get forced_bin_bounds_ + dataset->forced_bin_bounds_ = std::vector>(dataset->num_total_features_, std::vector()); + for (int i = 0; i < dataset->num_total_features_; ++i) { + int num_bounds = *(reinterpret_cast(mem_ptr)); + mem_ptr += sizeof(int); + dataset->forced_bin_bounds_[i] = std::vector(); + const double* tmp_ptr_forced_bounds = reinterpret_cast(mem_ptr); + + for (int j = 0; j < num_bounds; ++j) { + double bound = tmp_ptr_forced_bounds[j]; + dataset->forced_bin_bounds_[i].push_back(bound); + } + mem_ptr += num_bounds * sizeof(double); + + } // read size of meta data read_cnt = reader->Read(buffer.data(), sizeof(size_t)); @@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b return dataset.release(); } + Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, int** sample_indices, int num_col, const int* num_per_col, size_t total_sample_size, data_size_t num_data) { @@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, CHECK(static_cast(num_col) == config_.max_bin_by_feature.size()); CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col); + const data_size_t filter_cnt = static_cast( static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); if (Network::num_machines() == 1) { @@ -585,12 +605,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin, config_.min_data_in_bin, filter_cnt, - bin_type, config_.use_missing, config_.zero_as_missing); + bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -630,12 +651,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin_by_feature[start[rank] + i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -872,6 +894,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_); + // check the range of label_idx, weight_idx and group_idx CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_); @@ -909,12 +935,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -955,13 +982,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, - config_.use_missing, config_.zero_as_missing); + config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 9a34de869724..4eb1e2cb8e38 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -921,7 +921,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 100) + self.assertEqual(len(np.unique(est.predict(X))), 99) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) @@ -1590,3 +1590,33 @@ def constant_metric(preds, train_data): decreasing_metric(preds, train_data)], early_stopping_rounds=5, verbose_eval=False) self.assertEqual(gbm.best_iteration, 1) + + def test_forced_bins(self): + x = np.zeros((100, 2)) + x[:, 0] = np.arange(0, 1, 0.01) + x[:, 1] = -np.arange(0, 1, 0.01) + y = np.arange(0, 1, 0.01) + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + params = {'objective': 'regression_l1', + 'max_bin': 6, + 'forcedbins_filename': forcedbins_filename, + 'num_leaves': 2, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'seed': 0} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + new_x = np.zeros((3, x.shape[1])) + new_x[:, 0] = [0.31, 0.37, 0.41] + new_x[:, 1] = [0, 0, 0] + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 3) + new_x[:, 0] = [0, 0, 0] + new_x[:, 1] = [-0.25, -0.5, -0.9] + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 1) + params['forcedbins_filename'] = '' + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + predicted = est.predict(new_x) + self.assertEqual(len(np.unique(predicted)), 3) From 3178609187909da91dd3f5a6dd4e67fc0de30065 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 13/49] Fix style issues. --- docs/Parameters.rst | 4 +- .../regression}/forced_bins.json | 2 +- examples/regression/train.conf | 3 ++ include/LightGBM/config.h | 4 +- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 40 ++++++++++--------- src/io/dataset_loader.cpp | 1 + tests/python_package_test/test_engine.py | 3 +- 8 files changed, 34 insertions(+), 25 deletions(-) rename {tests/data => examples/regression}/forced_bins.json (98%) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 10105bfbed5a..c4f45f0010c4 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,9 +408,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json similarity index 98% rename from tests/data/forced_bins.json rename to examples/regression/forced_bins.json index aa74c36ffb78..1ee0a49d727c 100644 --- a/tests/data/forced_bins.json +++ b/examples/regression/forced_bins.json @@ -7,4 +7,4 @@ "feature": 1, "bin_upper_bound": [ -0.1, -0.15, -0.2 ] } -] \ No newline at end of file +] diff --git a/examples/regression/train.conf b/examples/regression/train.conf index 11396c23ecc2..4c73169dc8f9 100644 --- a/examples/regression/train.conf +++ b/examples/regression/train.conf @@ -29,6 +29,9 @@ is_training_metric = true # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. max_bin = 255 +# forced bin thresholds +# forcedbins_filename = forced_bins.json + # training data # if exsting weight file, should name to "regression.train.weight" # alias: train_data, train diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d2a953ddb416..56903a9b96ae 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,8 +403,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 62713d1bddd3..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c931e945cd24..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -5,10 +5,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -1071,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f36d5b1df27d..eb83d74bfe3d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4eb1e2cb8e38..2420ee9ec853 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1596,7 +1596,8 @@ def test_forced_bins(self): x[:, 0] = np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01) - forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', 'max_bin': 6, 'forcedbins_filename': forcedbins_filename, From 934b305422966ed6c8348e46290a13078544d7fb Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:19:58 +1000 Subject: [PATCH 14/49] Use stable sort. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2556a59b4715..b26a6a461e3e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } From dc45bd1d10b508101133229696ec10be0e271ede Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 15/49] Minor style and doc fixes. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/dataset_loader.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index c4f45f0010c4..28777637d100 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,7 +408,7 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - see `this file `__ as an example diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 56903a9b96ae..b67ee9656468 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,7 +403,7 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) // desc = see `this file `__ as an example std::string forcedbins_filename = ""; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index eb83d74bfe3d..c00b9b7fdae5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 018182ceccdd02ef930f20e63284be2cadc1cb14 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 16/49] Add functionality to force bin thresholds. --- docs/Parameters.rst | 4 +-- include/LightGBM/config.h | 4 +-- src/io/bin.cpp | 6 ++-- src/io/dataset.cpp | 39 +++++++++++------------- src/io/dataset_loader.cpp | 1 - tests/data/forced_bins.json | 10 ++++++ tests/python_package_test/test_engine.py | 3 +- 7 files changed, 36 insertions(+), 31 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 28777637d100..10105bfbed5a 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,9 +408,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index b67ee9656468..d2a953ddb416 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,8 +403,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index b26a6a461e3e..62713d1bddd3 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 269c06c4c37d..e948754034be 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -1071,29 +1072,25 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - if (forced_bins_stream.fail()) { - Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); - } else { - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); - } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); } } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } } return forced_bins; } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index c00b9b7fdae5..f36d5b1df27d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 2420ee9ec853..4eb1e2cb8e38 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1596,8 +1596,7 @@ def test_forced_bins(self): x[:, 0] = np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01) - forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../../examples/regression/forced_bins.json') + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') params = {'objective': 'regression_l1', 'max_bin': 6, 'forcedbins_filename': forcedbins_filename, From 7a4df5117deca7c7ff457b07a6cbbc3807103d4c Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 17/49] Fix style issues. --- docs/Parameters.rst | 4 +-- include/LightGBM/config.h | 4 +-- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 39 +++++++++++++----------- src/io/dataset_loader.cpp | 1 + tests/data/forced_bins.json | 10 ------ tests/python_package_test/test_engine.py | 3 +- 7 files changed, 29 insertions(+), 34 deletions(-) delete mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 10105bfbed5a..c4f45f0010c4 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,9 +408,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d2a953ddb416..56903a9b96ae 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,8 +403,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 62713d1bddd3..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e948754034be..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -1072,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f36d5b1df27d..eb83d74bfe3d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json deleted file mode 100644 index aa74c36ffb78..000000000000 --- a/tests/data/forced_bins.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "feature": 0, - "bin_upper_bound": [ 0.3, 0.35, 0.4 ] - }, - { - "feature": 1, - "bin_upper_bound": [ -0.1, -0.15, -0.2 ] - } -] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4eb1e2cb8e38..2420ee9ec853 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1596,7 +1596,8 @@ def test_forced_bins(self): x[:, 0] = np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01) - forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', 'max_bin': 6, 'forcedbins_filename': forcedbins_filename, From 6095148a9a2a18690d096038250dae4c2cc5c183 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:19:58 +1000 Subject: [PATCH 18/49] Use stable sort. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2556a59b4715..b26a6a461e3e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } From 8b57a56b65b5b1cc8b062145bc6380db7d73c678 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 19/49] Minor style and doc fixes. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/dataset_loader.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index c4f45f0010c4..28777637d100 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,7 +408,7 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - see `this file `__ as an example diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 56903a9b96ae..b67ee9656468 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,7 +403,7 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) // desc = see `this file `__ as an example std::string forcedbins_filename = ""; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index eb83d74bfe3d..c00b9b7fdae5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From de83a69e65802bc64c280cdc55ec6503025b3a1f Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 20 Aug 2019 21:26:06 +1000 Subject: [PATCH 20/49] Change binning behavior to be same as PR #2342. --- src/io/bin.cpp | 14 +++++++---- tests/python_package_test/test_engine.py | 31 +++++++++++++++++++++--- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index b26a6a461e3e..40da30c6ad2d 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -186,7 +186,7 @@ namespace LightGBM { } } - // include zero bounds if possible + // include zero bounds and infinity bound if (max_bin == 2) { if (left_cnt == 0) { bin_upper_bound.push_back(kZeroThreshold); @@ -194,9 +194,14 @@ namespace LightGBM { bin_upper_bound.push_back(-kZeroThreshold); } } else if (max_bin >= 3) { - bin_upper_bound.push_back(-kZeroThreshold); - bin_upper_bound.push_back(kZeroThreshold); + if (left_cnt > 0) { + bin_upper_bound.push_back(-kZeroThreshold); + } + if (right_start >= 0) { + bin_upper_bound.push_back(kZeroThreshold); + } } + bin_upper_bound.push_back(std::numeric_limits::infinity()); // add forced bounds, excluding zeros since we have already added zero bounds int i = 0; @@ -207,7 +212,6 @@ namespace LightGBM { ++i; } } - bin_upper_bound.push_back(std::numeric_limits::infinity()); int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); if (num_to_insert > 0) { @@ -239,7 +243,7 @@ namespace LightGBM { } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); - CHECK(bin_upper_bound.size() <= max_bin); + CHECK(bin_upper_bound.size() <= static_cast(max_bin)); return bin_upper_bound; } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 2420ee9ec853..9f807d64b102 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -921,7 +921,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 99) + self.assertEqual(len(np.unique(est.predict(X))), 100) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) @@ -1599,7 +1599,7 @@ def test_forced_bins(self): forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', - 'max_bin': 6, + 'max_bin': 5, 'forcedbins_filename': forcedbins_filename, 'num_leaves': 2, 'min_data_in_leaf': 1, @@ -1613,7 +1613,7 @@ def test_forced_bins(self): predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) new_x[:, 0] = [0, 0, 0] - new_x[:, 1] = [-0.25, -0.5, -0.9] + new_x[:, 1] = [-0.9, -0.6, -0.3] predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 1) params['forcedbins_filename'] = '' @@ -1621,3 +1621,28 @@ def test_forced_bins(self): est = lgb.train(params, lgb_x, num_boost_round=100) predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) + + def test_binning_same_sign(self): + # test that binning works properly for features with only positive or only negative values + x = np.zeros((99, 2)) + x[:, 0] = np.arange(0.01, 1, 0.01) + x[:, 1] = -np.arange(0.01, 1, 0.01) + y = np.arange(0.01, 1, 0.01) + params = {'objective': 'regression_l1', + 'max_bin': 5, + 'num_leaves': 2, + 'min_data_in_leaf': 1, + 'verbose': -1, + 'seed': 0} + lgb_x = lgb.Dataset(x, label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + new_x = np.zeros((3, 2)) + new_x[:, 0] = [-1, 0, 1] + predicted = est.predict(new_x) + self.assertAlmostEqual(predicted[0], predicted[1]) + self.assertNotAlmostEqual(predicted[1], predicted[2]) + new_x = np.zeros((3, 2)) + new_x[:, 1] = [-1, 0, 1] + predicted = est.predict(new_x) + self.assertNotAlmostEqual(predicted[0], predicted[1]) + self.assertAlmostEqual(predicted[1], predicted[2]) From c4787757b33c5ecec6d574b1a7b2ef133c0d4f89 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 21/49] Add functionality to force bin thresholds. --- docs/Parameters.rst | 8 +++ include/LightGBM/bin.h | 3 +- include/LightGBM/config.h | 5 ++ include/LightGBM/dataset.h | 3 ++ src/io/bin.cpp | 21 ++++---- src/io/config_auto.cpp | 4 ++ src/io/dataset.cpp | 64 +++++++++++++++++++++++- src/io/dataset_loader.cpp | 46 +++++++++++++---- tests/data/forced_bins.json | 10 ++++ tests/python_package_test/test_engine.py | 2 +- 10 files changed, 145 insertions(+), 21 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index aaa10eef347b..1fd11c94bd73 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -412,6 +412,14 @@ Learning Control Parameters - see `this file `__ as an example +- ``forcedbins_filename`` :raw-html:`🔗︎`, default = ``""``, type = string + + - path to a ``.json`` file that specifies bin upper bounds for some or all features + + - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + + - see `this file `__ as an example + - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` - decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 46baee58fc46..1c5f62cd1907 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -146,9 +146,10 @@ class BinMapper { * \param bin_type Type of this bin * \param use_missing True to enable missing value handle * \param zero_as_missing True to use zero as missing value + * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm) */ void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, - bool use_missing, bool zero_as_missing); + bool use_missing, bool zero_as_missing, std::vector forced_upper_bounds); /*! * \brief Use specific number of bin to calculate the size of this class diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 3e1a6c4f0bd6..049b0bf1f8df 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -408,6 +408,11 @@ struct Config { // desc = see `this file `__ as an example std::string forcedsplits_filename = ""; + // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features + // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + // desc = see `this file `__ as an example + std::string forcedbins_filename = ""; + // check = >=0.0 // check = <=1.0 // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e688522fbb1a..900487eafbf4 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -596,6 +596,8 @@ class Dataset { void addFeaturesFrom(Dataset* other); + static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features); + private: std::string data_filename_; /*! \brief Store used features */ @@ -630,6 +632,7 @@ class Dataset { bool is_finish_load_; int max_bin_; std::vector max_bin_by_feature_; + std::vector> forced_bin_bounds_; int bin_construct_sample_cnt_; int min_data_in_bin_; bool use_missing_; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2e79a80266b6..5c41edaad9b2 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -150,8 +150,10 @@ namespace LightGBM { } std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { std::vector bin_upper_bound; + + // get list of distinct values int left_cnt_data = 0; int cnt_zero = 0; int right_cnt_data = 0; @@ -165,6 +167,7 @@ namespace LightGBM { } } + // get number of positive and negative distinct values int left_cnt = -1; for (int i = 0; i < num_distinct_values; ++i) { if (distinct_values[i] > -kZeroThreshold) { @@ -172,7 +175,6 @@ namespace LightGBM { break; } } - if (left_cnt < 0) { left_cnt = num_distinct_values; } @@ -199,16 +201,14 @@ namespace LightGBM { auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); bin_upper_bound.push_back(kZeroThreshold); - bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); - } else { - bin_upper_bound.push_back(std::numeric_limits::infinity()); } CHECK(bin_upper_bound.size() <= static_cast(max_bin)); return bin_upper_bound; } void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, - int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) { + int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, + std::vector forced_upper_bounds) { int na_cnt = 0; int tmp_num_sample_values = 0; for (int i = 0; i < num_sample_values; ++i) { @@ -276,14 +276,17 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); if (bin_upper_bound_.size() == 2) { missing_type_ = MissingType::None; } } else if (missing_type_ == MissingType::None) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); } else { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin, forced_upper_bounds); bin_upper_bound_.push_back(NaN); } num_bin_ = static_cast(bin_upper_bound_.size()); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b2957cb6335b..aaafe6d4507c 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -214,6 +214,7 @@ std::unordered_set Config::parameter_set({ "monotone_constraints", "feature_contri", "forcedsplits_filename", + "forcedbins_filename", "refit_decay_rate", "cegb_tradeoff", "cegb_penalty_split", @@ -402,6 +403,8 @@ void Config::GetMembersFromString(const std::unordered_map=0.0); CHECK(refit_decay_rate <=1.0); @@ -617,6 +620,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast(monotone_constraints), ",") << "]\n"; str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; + str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n"; str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index f201a40a1a7a..c931e945cd24 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -8,12 +8,17 @@ #include #include #include +#include #include #include #include #include #include +#include + +using namespace json11; + namespace LightGBM { @@ -324,6 +329,7 @@ void Dataset::Construct( max_bin_by_feature_.resize(num_total_features_); max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end()); } + forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; @@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) { if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } + if (param.count("forcedbins_filename")) { + std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); + if (config_bounds != forced_bin_bounds_) { + Log::Warning("Cannot change forced bins after constructed Dataset handle."); + } + } if (!io_config.monotone_constraints.empty()) { CHECK(static_cast(num_total_features_) == io_config.monotone_constraints.size()); @@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { for (int i = 0; i < num_total_features_; ++i) { size_of_header += feature_names_[i].size() + sizeof(int); } + // size of forced bins + for (int i = 0; i < num_total_features_; ++i) { + size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int); + } writer->Write(&size_of_header, sizeof(size_of_header)); // write header writer->Write(&num_data_, sizeof(num_data_)); @@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { const char* c_str = feature_names_[i].c_str(); writer->Write(c_str, sizeof(char) * str_len); } + // write forced bins + for (int i = 0; i < num_total_features_; ++i) { + int num_bounds = static_cast(forced_bin_bounds_[i].size()); + writer->Write(&num_bounds, sizeof(int)); + + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); + } + } // get size of meta data size_t size_of_metadata = metadata_.SizesInByte(); @@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) { for (auto n : feature_names_) { fprintf(file, "%s, ", n.c_str()); } + fprintf(file, "\nforced_bins: "); + for (int i = 0; i < num_total_features_; ++i) { + fprintf(file, "\nfeature %d: ", i); + for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + fprintf(file, "%lf, ", forced_bin_bounds_[i][j]); + } + } std::vector> iterators; iterators.reserve(num_features_); for (int j = 0; j < num_features_; ++j) { @@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushVector(feature_names_, other->feature_names_); PushVector(feature2subfeature_, other->feature2subfeature_); PushVector(group_feature_cnt_, other->group_feature_cnt_); + PushVector(forced_bin_bounds_, other->forced_bin_bounds_); feature_groups_.reserve(other->feature_groups_.size()); for (auto& fg : other->feature_groups_) { feature_groups_.emplace_back(new FeatureGroup(*fg)); @@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) { PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0); PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0); - + PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1); num_features_ += other->num_features_; num_total_features_ += other->num_total_features_; num_groups_ += other->num_groups_; } + +std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) { + std::vector> forced_bins(num_total_features, std::vector()); + if (forced_bins_path != "") { + std::ifstream forced_bins_stream(forced_bins_path.c_str()); + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } + } + return forced_bins; +} + + } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index ee47bece8fa5..bdfe3b0b4dfc 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include @@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b } dataset->feature_names_.emplace_back(str_buf.str()); } + // get forced_bin_bounds_ + dataset->forced_bin_bounds_ = std::vector>(dataset->num_total_features_, std::vector()); + for (int i = 0; i < dataset->num_total_features_; ++i) { + int num_bounds = *(reinterpret_cast(mem_ptr)); + mem_ptr += sizeof(int); + dataset->forced_bin_bounds_[i] = std::vector(); + const double* tmp_ptr_forced_bounds = reinterpret_cast(mem_ptr); + + for (int j = 0; j < num_bounds; ++j) { + double bound = tmp_ptr_forced_bounds[j]; + dataset->forced_bin_bounds_[i].push_back(bound); + } + mem_ptr += num_bounds * sizeof(double); + + } // read size of meta data read_cnt = reader->Read(buffer.data(), sizeof(size_t)); @@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b return dataset.release(); } + Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, int** sample_indices, int num_col, const int* num_per_col, size_t total_sample_size, data_size_t num_data) { @@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, CHECK(static_cast(num_col) == config_.max_bin_by_feature.size()); CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col); + const data_size_t filter_cnt = static_cast( static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); if (Network::num_machines() == 1) { @@ -589,12 +609,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin, config_.min_data_in_bin, filter_cnt, - bin_type, config_.use_missing, config_.zero_as_missing); + bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size, config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -634,12 +655,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i], total_sample_size, config_.max_bin_by_feature[start[rank] + i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -876,6 +898,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1); } + // get forced split + std::string forced_bins_path = config_.forcedbins_filename; + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_); + // check the range of label_idx, weight_idx and group_idx CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_); @@ -913,12 +939,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, if (config_.max_bin_by_feature.empty()) { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[i].data(), static_cast(sample_values[i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing, - config_.zero_as_missing); + config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } @@ -959,13 +986,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin, config_.min_data_in_bin, - filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing); + filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, + forced_bin_bounds[i]); } else { bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast(sample_values[start[rank] + i].size()), sample_data.size(), config_.max_bin_by_feature[i], config_.min_data_in_bin, filter_cnt, bin_type, - config_.use_missing, config_.zero_as_missing); + config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]); } OMP_LOOP_EX_END(); } diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 63f1468132a5..4c60a23ba4ea 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -915,7 +915,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 100) + self.assertEqual(len(np.unique(est.predict(X))), 99) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) From e3f183572fb35f2fcc73144eeb67a8106ea72eca Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 22/49] Fix style issues. --- docs/Parameters.rst | 4 +- .../regression}/forced_bins.json | 2 +- examples/regression/train.conf | 3 ++ include/LightGBM/config.h | 4 +- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 40 ++++++++++--------- src/io/dataset_loader.cpp | 1 + 7 files changed, 32 insertions(+), 24 deletions(-) rename {tests/data => examples/regression}/forced_bins.json (98%) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 1fd11c94bd73..e33b36eb944e 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -416,9 +416,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json similarity index 98% rename from tests/data/forced_bins.json rename to examples/regression/forced_bins.json index aa74c36ffb78..1ee0a49d727c 100644 --- a/tests/data/forced_bins.json +++ b/examples/regression/forced_bins.json @@ -7,4 +7,4 @@ "feature": 1, "bin_upper_bound": [ -0.1, -0.15, -0.2 ] } -] \ No newline at end of file +] diff --git a/examples/regression/train.conf b/examples/regression/train.conf index 11396c23ecc2..4c73169dc8f9 100644 --- a/examples/regression/train.conf +++ b/examples/regression/train.conf @@ -29,6 +29,9 @@ is_training_metric = true # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. max_bin = 255 +# forced bin thresholds +# forcedbins_filename = forced_bins.json + # training data # if exsting weight file, should name to "regression.train.weight" # alias: train_data, train diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 049b0bf1f8df..8e0f0608a282 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -409,8 +409,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 5c41edaad9b2..43ab1c8eacdb 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -283,7 +283,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c931e945cd24..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -5,10 +5,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -1071,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index bdfe3b0b4dfc..7a11957558c5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include From 2280c568715f241fb5adbc533895483803878695 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 23/49] Minor style and doc fixes. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/dataset_loader.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index e33b36eb944e..b971215dcde9 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -416,7 +416,7 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - see `this file `__ as an example diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 8e0f0608a282..baba482c5e52 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -409,7 +409,7 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) // desc = see `this file `__ as an example std::string forcedbins_filename = ""; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 7a11957558c5..6e60560a9be1 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 76fa4ccf2167d9337adfe9803b01025ae7a37b1f Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 24/49] Add functionality to force bin thresholds. --- src/io/dataset.cpp | 1 + src/io/dataset_loader.cpp | 1 - tests/data/forced_bins.json | 10 ++++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/data/forced_bins.json diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 269c06c4c37d..2e400387663e 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 6e60560a9be1..bdfe3b0b4dfc 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file From 93d92ebc94a2cb1a9ced2a175945f01eecb9f8ae Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 25/49] Fix style issues. --- src/io/dataset.cpp | 1 - src/io/dataset_loader.cpp | 1 + tests/data/forced_bins.json | 10 ---------- 3 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 tests/data/forced_bins.json diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 2e400387663e..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index bdfe3b0b4dfc..7a11957558c5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json deleted file mode 100644 index aa74c36ffb78..000000000000 --- a/tests/data/forced_bins.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "feature": 0, - "bin_upper_bound": [ 0.3, 0.35, 0.4 ] - }, - { - "feature": 1, - "bin_upper_bound": [ -0.1, -0.15, -0.2 ] - } -] \ No newline at end of file From fec30a581974f858a7ed3900f062187c46576f7e Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 26/49] Minor style and doc fixes. --- src/io/dataset_loader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 7a11957558c5..6e60560a9be1 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 503e7b49e2ee2af65fb955bae2afb8b31a8cfd0d Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 20 Aug 2019 21:26:06 +1000 Subject: [PATCH 27/49] Change binning behavior to be same as PR #2342. --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4c60a23ba4ea..63f1468132a5 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -915,7 +915,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 99) + self.assertEqual(len(np.unique(est.predict(X))), 100) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) From eecb80c7ce475f671db4f5d64bd5427abe3d89a5 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 28/49] Add functionality to force bin thresholds. --- src/io/bin.cpp | 67 ++++++++++++++++++------ src/io/dataset.cpp | 1 + src/io/dataset_loader.cpp | 1 - tests/data/forced_bins.json | 10 ++++ tests/python_package_test/test_engine.py | 2 +- 5 files changed, 64 insertions(+), 17 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 43ab1c8eacdb..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -178,16 +178,6 @@ namespace LightGBM { if (left_cnt < 0) { left_cnt = num_distinct_values; } - - if ((left_cnt > 0) && (max_bin > 1)) { - int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); - left_max_bin = std::max(1, left_max_bin); - bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); - if (bin_upper_bound.size() > 0) { - bin_upper_bound.back() = -kZeroThreshold; - } - } - int right_start = -1; for (int i = left_cnt; i < num_distinct_values; ++i) { if (distinct_values[i] > kZeroThreshold) { @@ -196,13 +186,60 @@ namespace LightGBM { } } - int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); - if (right_start >= 0 && right_max_bin > 0) { - auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, - num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); + // include zero bounds if possible + if (max_bin == 2) { + if (left_cnt == 0) { + bin_upper_bound.push_back(kZeroThreshold); + } else { + bin_upper_bound.push_back(-kZeroThreshold); + } + } else if (max_bin >= 3) { + bin_upper_bound.push_back(-kZeroThreshold); bin_upper_bound.push_back(kZeroThreshold); } - CHECK(bin_upper_bound.size() <= static_cast(max_bin)); + + // add forced bounds, excluding zeros since we have already added zero bounds + int i = 0; + while (i < forced_upper_bounds.size()) { + if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { + forced_upper_bounds.erase(forced_upper_bounds.begin() + i); + } else { + ++i; + } + } + bin_upper_bound.push_back(std::numeric_limits::infinity()); + int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); + int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); + if (num_to_insert > 0) { + bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); + } + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + + // find remaining bounds + std::vector bounds_to_add; + int value_ind = 0; + for (int i = 0; i < bin_upper_bound.size(); ++i) { + int cnt_in_bin = 0; + int distinct_cnt_in_bin = 0; + int bin_start = value_ind; + while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) { + cnt_in_bin += counts[value_ind]; + ++distinct_cnt_in_bin; + ++value_ind; + } + int bins_remaining = max_bin - static_cast(bin_upper_bound.size()) - static_cast(bounds_to_add.size()); + int num_sub_bins = static_cast(std::lround((static_cast(cnt_in_bin) * bins_remaining / total_sample_cnt))); + num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1; + if (i == bin_upper_bound.size() - 1) { + num_sub_bins = bins_remaining + 1; + } + std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, + num_sub_bins, cnt_in_bin, min_data_in_bin); + bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity + } + bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 269c06c4c37d..2e400387663e 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 6e60560a9be1..bdfe3b0b4dfc 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 63f1468132a5..4c60a23ba4ea 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -915,7 +915,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 100) + self.assertEqual(len(np.unique(est.predict(X))), 99) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) From a02b3a3eaf91bdfcf163d7fd888e0184ecdadb45 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 29/49] Fix style issues. --- src/io/dataset.cpp | 1 - src/io/dataset_loader.cpp | 1 + tests/data/forced_bins.json | 10 ---------- 3 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 tests/data/forced_bins.json diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 2e400387663e..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index bdfe3b0b4dfc..7a11957558c5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json deleted file mode 100644 index aa74c36ffb78..000000000000 --- a/tests/data/forced_bins.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "feature": 0, - "bin_upper_bound": [ 0.3, 0.35, 0.4 ] - }, - { - "feature": 1, - "bin_upper_bound": [ -0.1, -0.15, -0.2 ] - } -] \ No newline at end of file From cb12379795b6307820620e0a98ad01c4cbf0ff5e Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:19:58 +1000 Subject: [PATCH 30/49] Use stable sort. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2556a59b4715..b26a6a461e3e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } From abe95d787c34084ed431c84ebff2bde9797b0d2a Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 31/49] Minor style and doc fixes. --- src/io/dataset_loader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 7a11957558c5..6e60560a9be1 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 7aed6892e6d9f17a587e86d224e22248e8bffae6 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 13 Aug 2019 18:14:54 +1000 Subject: [PATCH 32/49] Add functionality to force bin thresholds. --- docs/Parameters.rst | 4 ++-- include/LightGBM/config.h | 4 ++-- src/io/bin.cpp | 6 +++--- src/io/dataset.cpp | 39 +++++++++++++++++-------------------- src/io/dataset_loader.cpp | 1 - tests/data/forced_bins.json | 10 ++++++++++ 6 files changed, 35 insertions(+), 29 deletions(-) create mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index b971215dcde9..1fd11c94bd73 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -416,9 +416,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index baba482c5e52..049b0bf1f8df 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -409,8 +409,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index b26a6a461e3e..62713d1bddd3 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 269c06c4c37d..e948754034be 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -1071,29 +1072,25 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - if (forced_bins_stream.fail()) { - Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); - } else { - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); - } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); } } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } } return forced_bins; } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 6e60560a9be1..bdfe3b0b4dfc 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include - #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json new file mode 100644 index 000000000000..aa74c36ffb78 --- /dev/null +++ b/tests/data/forced_bins.json @@ -0,0 +1,10 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.3, 0.35, 0.4 ] + }, + { + "feature": 1, + "bin_upper_bound": [ -0.1, -0.15, -0.2 ] + } +] \ No newline at end of file From 35ce38bd7eb48a7a7110e8f4206abd07622a26ee Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH 33/49] Fix style issues. --- docs/Parameters.rst | 4 ++-- include/LightGBM/config.h | 4 ++-- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 39 ++++++++++++++++++++----------------- src/io/dataset_loader.cpp | 1 + tests/data/forced_bins.json | 10 ---------- 6 files changed, 27 insertions(+), 33 deletions(-) delete mode 100644 tests/data/forced_bins.json diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 1fd11c94bd73..e33b36eb944e 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -416,9 +416,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 049b0bf1f8df..8e0f0608a282 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -409,8 +409,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 62713d1bddd3..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e948754034be..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -1072,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index bdfe3b0b4dfc..7a11957558c5 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json deleted file mode 100644 index aa74c36ffb78..000000000000 --- a/tests/data/forced_bins.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "feature": 0, - "bin_upper_bound": [ 0.3, 0.35, 0.4 ] - }, - { - "feature": 1, - "bin_upper_bound": [ -0.1, -0.15, -0.2 ] - } -] \ No newline at end of file From 28c046205332312519d049ff536ed26c34f8dd43 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:19:58 +1000 Subject: [PATCH 34/49] Use stable sort. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 2556a59b4715..b26a6a461e3e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -213,7 +213,7 @@ namespace LightGBM { if (num_to_insert > 0) { bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); } - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds std::vector bounds_to_add; @@ -238,7 +238,7 @@ namespace LightGBM { bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); - std::sort(bin_upper_bound.begin(), bin_upper_bound.end()); + std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); CHECK(bin_upper_bound.size() <= max_bin); return bin_upper_bound; } From 23dbb29f4e9631b9430592bea53cafa3c3372e60 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 15 Aug 2019 19:17:19 +1000 Subject: [PATCH 35/49] Minor style and doc fixes. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/dataset_loader.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index e33b36eb944e..b971215dcde9 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -416,7 +416,7 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - see `this file `__ as an example diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 8e0f0608a282..baba482c5e52 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -409,7 +409,7 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) // desc = see `this file `__ as an example std::string forcedbins_filename = ""; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 7a11957558c5..6e60560a9be1 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,8 +2,8 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ - #include + #include #include #include From 9ed04a336b9839366c93b66f050ad52917ed0b68 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 20 Aug 2019 21:26:06 +1000 Subject: [PATCH 36/49] Change binning behavior to be same as PR #2342. --- src/io/bin.cpp | 14 +++++++++----- tests/python_package_test/test_engine.py | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index b26a6a461e3e..40da30c6ad2d 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -186,7 +186,7 @@ namespace LightGBM { } } - // include zero bounds if possible + // include zero bounds and infinity bound if (max_bin == 2) { if (left_cnt == 0) { bin_upper_bound.push_back(kZeroThreshold); @@ -194,9 +194,14 @@ namespace LightGBM { bin_upper_bound.push_back(-kZeroThreshold); } } else if (max_bin >= 3) { - bin_upper_bound.push_back(-kZeroThreshold); - bin_upper_bound.push_back(kZeroThreshold); + if (left_cnt > 0) { + bin_upper_bound.push_back(-kZeroThreshold); + } + if (right_start >= 0) { + bin_upper_bound.push_back(kZeroThreshold); + } } + bin_upper_bound.push_back(std::numeric_limits::infinity()); // add forced bounds, excluding zeros since we have already added zero bounds int i = 0; @@ -207,7 +212,6 @@ namespace LightGBM { ++i; } } - bin_upper_bound.push_back(std::numeric_limits::infinity()); int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); if (num_to_insert > 0) { @@ -239,7 +243,7 @@ namespace LightGBM { } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); - CHECK(bin_upper_bound.size() <= max_bin); + CHECK(bin_upper_bound.size() <= static_cast(max_bin)); return bin_upper_bound; } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4c60a23ba4ea..63f1468132a5 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -915,7 +915,7 @@ def test_max_bin_by_feature(self): } lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) - self.assertEqual(len(np.unique(est.predict(X))), 99) + self.assertEqual(len(np.unique(est.predict(X))), 100) params['max_bin_by_feature'] = [2, 100] lgb_data = lgb.Dataset(X, label=y) est = lgb.train(params, lgb_data, num_boost_round=1) From 51e93a9d4d3b5f8cdd18dd0f4111b69d90b6fa5d Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 11 Sep 2019 22:05:00 +1000 Subject: [PATCH 37/49] Use different bin finding function for predefined bounds. --- src/io/bin.cpp | 90 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 6 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 40da30c6ad2d..6f9b7a471177 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -149,7 +149,69 @@ namespace LightGBM { return bin_upper_bound; } + std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { + std::vector bin_upper_bound; + int left_cnt_data = 0; + int cnt_zero = 0; + int right_cnt_data = 0; + for (int i = 0; i < num_distinct_values; ++i) { + if (distinct_values[i] <= -kZeroThreshold) { + left_cnt_data += counts[i]; + } + else if (distinct_values[i] > kZeroThreshold) { + right_cnt_data += counts[i]; + } + else { + cnt_zero += counts[i]; + } + } + + int left_cnt = -1; + for (int i = 0; i < num_distinct_values; ++i) { + if (distinct_values[i] > -kZeroThreshold) { + left_cnt = i; + break; + } + } + + if (left_cnt < 0) { + left_cnt = num_distinct_values; + } + + if ((left_cnt > 0) && (max_bin > 1)) { + int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); + left_max_bin = std::max(1, left_max_bin); + bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); + if (bin_upper_bound.size() > 0) { + bin_upper_bound.back() = -kZeroThreshold; + } + } + + int right_start = -1; + for (int i = left_cnt; i < num_distinct_values; ++i) { + if (distinct_values[i] > kZeroThreshold) { + right_start = i; + break; + } + } + + int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); + if (right_start >= 0 && right_max_bin > 0) { + auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, + num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); + bin_upper_bound.push_back(kZeroThreshold); + bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); + } + else { + bin_upper_bound.push_back(std::numeric_limits::infinity()); + } + CHECK(bin_upper_bound.size() <= static_cast(max_bin)); + return bin_upper_bound; + } + + std::vector FindBinWithPredefinedBin(const double* distinct_values, const int* counts, int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { std::vector bin_upper_bound; @@ -317,17 +379,33 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + auto empty_vec = std::vector(); + if (forced_upper_bounds.size() == 0) { + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin); + } else { + bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); + } if (bin_upper_bound_.size() == 2) { missing_type_ = MissingType::None; } } else if (missing_type_ == MissingType::None) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + if (forced_upper_bounds.size() == 0) { + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin); + } else { + bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); + } } else { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, - min_data_in_bin, forced_upper_bounds); + if (forced_upper_bounds.size() == 0) { + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin); + } else { + bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin, forced_upper_bounds); + } bin_upper_bound_.push_back(NaN); } num_bin_ = static_cast(bin_upper_bound_.size()); From 4e3355a4699aa1d5c49866c51e9c7aa27b203366 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 12 Sep 2019 19:21:00 +1000 Subject: [PATCH 38/49] Fix style issues. --- src/io/bin.cpp | 13 ++++--------- src/io/dataset.cpp | 4 ++-- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 6f9b7a471177..88da7991e9b0 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -149,7 +149,6 @@ namespace LightGBM { return bin_upper_bound; } - std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { std::vector bin_upper_bound; @@ -159,11 +158,9 @@ namespace LightGBM { for (int i = 0; i < num_distinct_values; ++i) { if (distinct_values[i] <= -kZeroThreshold) { left_cnt_data += counts[i]; - } - else if (distinct_values[i] > kZeroThreshold) { + } else if (distinct_values[i] > kZeroThreshold) { right_cnt_data += counts[i]; - } - else { + } else { cnt_zero += counts[i]; } } @@ -203,8 +200,7 @@ namespace LightGBM { num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); bin_upper_bound.push_back(kZeroThreshold); bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); - } - else { + } else { bin_upper_bound.push_back(std::numeric_limits::infinity()); } CHECK(bin_upper_bound.size() <= static_cast(max_bin)); @@ -300,7 +296,7 @@ namespace LightGBM { num_sub_bins = bins_remaining + 1; } std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, - num_sub_bins, cnt_in_bin, min_data_in_bin); + num_sub_bins, cnt_in_bin, min_data_in_bin); bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); @@ -379,7 +375,6 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - auto empty_vec = std::vector(); if (forced_upper_bounds.size() == 0) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 269c06c4c37d..21977a660de0 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -10,12 +10,12 @@ #include #include -#include #include #include +#include +#include #include #include -#include using namespace json11; From 821b2ab440541136f045a6261f9bfd48cf37fda3 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 12 Sep 2019 21:10:42 +1000 Subject: [PATCH 39/49] Minor refactoring, overload FindBinWithZeroAsOneBin. --- src/io/bin.cpp | 161 ++++++++++++++++++++++++------------------------- 1 file changed, 78 insertions(+), 83 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 88da7991e9b0..8ca57b936b08 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -71,7 +71,7 @@ namespace LightGBM { return true; } - std::vector GreedyFindBin(const double* distinct_values, const int* counts, + std::vector GreedyFindBin(const double* distinct_values, const int* counts, int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) { std::vector bin_upper_bound; CHECK(max_bin > 0); @@ -149,64 +149,6 @@ namespace LightGBM { return bin_upper_bound; } - std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { - std::vector bin_upper_bound; - int left_cnt_data = 0; - int cnt_zero = 0; - int right_cnt_data = 0; - for (int i = 0; i < num_distinct_values; ++i) { - if (distinct_values[i] <= -kZeroThreshold) { - left_cnt_data += counts[i]; - } else if (distinct_values[i] > kZeroThreshold) { - right_cnt_data += counts[i]; - } else { - cnt_zero += counts[i]; - } - } - - int left_cnt = -1; - for (int i = 0; i < num_distinct_values; ++i) { - if (distinct_values[i] > -kZeroThreshold) { - left_cnt = i; - break; - } - } - - if (left_cnt < 0) { - left_cnt = num_distinct_values; - } - - if ((left_cnt > 0) && (max_bin > 1)) { - int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); - left_max_bin = std::max(1, left_max_bin); - bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); - if (bin_upper_bound.size() > 0) { - bin_upper_bound.back() = -kZeroThreshold; - } - } - - int right_start = -1; - for (int i = left_cnt; i < num_distinct_values; ++i) { - if (distinct_values[i] > kZeroThreshold) { - right_start = i; - break; - } - } - - int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); - if (right_start >= 0 && right_max_bin > 0) { - auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, - num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); - bin_upper_bound.push_back(kZeroThreshold); - bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); - } else { - bin_upper_bound.push_back(std::numeric_limits::infinity()); - } - CHECK(bin_upper_bound.size() <= static_cast(max_bin)); - return bin_upper_bound; - } - std::vector FindBinWithPredefinedBin(const double* distinct_values, const int* counts, int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { std::vector bin_upper_bound; @@ -260,7 +202,7 @@ namespace LightGBM { } } bin_upper_bound.push_back(std::numeric_limits::infinity()); - + // add forced bounds, excluding zeros since we have already added zero bounds int i = 0; while (i < forced_upper_bounds.size()) { @@ -295,8 +237,8 @@ namespace LightGBM { if (i == bin_upper_bound.size() - 1) { num_sub_bins = bins_remaining + 1; } - std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, - num_sub_bins, cnt_in_bin, min_data_in_bin); + std::vector new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, + num_sub_bins, cnt_in_bin, min_data_in_bin); bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity } bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); @@ -305,6 +247,74 @@ namespace LightGBM { return bin_upper_bound; } + std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, + int max_bin, size_t total_sample_cnt, int min_data_in_bin) { + std::vector bin_upper_bound; + int left_cnt_data = 0; + int cnt_zero = 0; + int right_cnt_data = 0; + for (int i = 0; i < num_distinct_values; ++i) { + if (distinct_values[i] <= -kZeroThreshold) { + left_cnt_data += counts[i]; + } else if (distinct_values[i] > kZeroThreshold) { + right_cnt_data += counts[i]; + } else { + cnt_zero += counts[i]; + } + } + + int left_cnt = -1; + for (int i = 0; i < num_distinct_values; ++i) { + if (distinct_values[i] > -kZeroThreshold) { + left_cnt = i; + break; + } + } + + if (left_cnt < 0) { + left_cnt = num_distinct_values; + } + + if ((left_cnt > 0) && (max_bin > 1)) { + int left_max_bin = static_cast(static_cast(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1)); + left_max_bin = std::max(1, left_max_bin); + bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); + if (bin_upper_bound.size() > 0) { + bin_upper_bound.back() = -kZeroThreshold; + } + } + + int right_start = -1; + for (int i = left_cnt; i < num_distinct_values; ++i) { + if (distinct_values[i] > kZeroThreshold) { + right_start = i; + break; + } + } + + int right_max_bin = max_bin - 1 - static_cast(bin_upper_bound.size()); + if (right_start >= 0 && right_max_bin > 0) { + auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, + num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); + bin_upper_bound.push_back(kZeroThreshold); + bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); + } else { + bin_upper_bound.push_back(std::numeric_limits::infinity()); + } + CHECK(bin_upper_bound.size() <= static_cast(max_bin)); + return bin_upper_bound; + } + + std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, + int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { + if (forced_upper_bounds.empty()) { + return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); + } else { + return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin, + forced_upper_bounds); + } + } + void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, std::vector forced_upper_bounds) { @@ -375,32 +385,17 @@ namespace LightGBM { int num_distinct_values = static_cast(distinct_values.size()); if (bin_type_ == BinType::NumericalBin) { if (missing_type_ == MissingType::Zero) { - if (forced_upper_bounds.size() == 0) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin); - } else { - bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); - } + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); if (bin_upper_bound_.size() == 2) { missing_type_ = MissingType::None; } } else if (missing_type_ == MissingType::None) { - if (forced_upper_bounds.size() == 0) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin); - } else { - bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); - } + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, + min_data_in_bin, forced_upper_bounds); } else { - if (forced_upper_bounds.size() == 0) { - bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, - min_data_in_bin); - } else { - bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, - min_data_in_bin, forced_upper_bounds); - } + bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, + min_data_in_bin, forced_upper_bounds); bin_upper_bound_.push_back(NaN); } num_bin_ = static_cast(bin_upper_bound_.size()); From 8a5244481e1547eb3e730ad65f35d917ca678343 Mon Sep 17 00:00:00 2001 From: btrotta Date: Fri, 13 Sep 2019 18:43:10 +1000 Subject: [PATCH 40/49] Fix style issues. --- src/io/bin.cpp | 4 ++-- src/io/dataset.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 8ca57b936b08..d5f0832961f6 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -204,7 +204,7 @@ namespace LightGBM { bin_upper_bound.push_back(std::numeric_limits::infinity()); // add forced bounds, excluding zeros since we have already added zero bounds - int i = 0; + size_t i = 0; while (i < forced_upper_bounds.size()) { if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { forced_upper_bounds.erase(forced_upper_bounds.begin() + i); @@ -222,7 +222,7 @@ namespace LightGBM { // find remaining bounds std::vector bounds_to_add; int value_ind = 0; - for (int i = 0; i < bin_upper_bound.size(); ++i) { + for (size_t i = 0; i < bin_upper_bound.size(); ++i) { int cnt_in_bin = 0; int distinct_cnt_in_bin = 0; int bin_start = value_ind; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 21977a660de0..8048dafdec4e 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -10,10 +10,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -726,7 +726,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { int num_bounds = static_cast(forced_bin_bounds_[i].size()); writer->Write(&num_bounds, sizeof(int)); - for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) { writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); } } @@ -782,7 +782,7 @@ void Dataset::DumpTextFile(const char* text_filename) { fprintf(file, "\nforced_bins: "); for (int i = 0; i < num_total_features_; ++i) { fprintf(file, "\nfeature %d: ", i); - for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) { + for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) { fprintf(file, "%lf, ", forced_bin_bounds_[i][j]); } } @@ -1080,11 +1080,11 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ Json forced_bins_json = Json::parse(buffer.str(), err); CHECK(forced_bins_json.is_array()); std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { + for (size_t i = 0; i < forced_bins_arr.size(); ++i) { int feature_num = forced_bins_arr[i]["feature"].int_value(); CHECK(feature_num < num_total_features); std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { + for (size_t j = 0; j < bounds_arr.size(); ++j) { forced_bins[feature_num].push_back(bounds_arr[j].number_value()); } } From c591e7b8c40de69cd27b2b00f683efd639216046 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 17 Sep 2019 18:06:56 +1000 Subject: [PATCH 41/49] Fix bug and add new test. --- examples/regression/forced_bins2.json | 6 ++++++ src/io/bin.cpp | 3 ++- tests/python_package_test/test_engine.py | 9 +++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 examples/regression/forced_bins2.json diff --git a/examples/regression/forced_bins2.json b/examples/regression/forced_bins2.json new file mode 100644 index 000000000000..f4dca0ccaf34 --- /dev/null +++ b/examples/regression/forced_bins2.json @@ -0,0 +1,6 @@ +[ + { + "feature": 0, + "bin_upper_bound": [ 0.19, 0.39, 0.59, 0.79 ] + } +] diff --git a/src/io/bin.cpp b/src/io/bin.cpp index d5f0832961f6..23a19273bfbf 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -220,6 +220,7 @@ namespace LightGBM { std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); // find remaining bounds + int free_bins = max_bin - static_cast(bin_upper_bound.size()); std::vector bounds_to_add; int value_ind = 0; for (size_t i = 0; i < bin_upper_bound.size(); ++i) { @@ -232,7 +233,7 @@ namespace LightGBM { ++value_ind; } int bins_remaining = max_bin - static_cast(bin_upper_bound.size()) - static_cast(bounds_to_add.size()); - int num_sub_bins = static_cast(std::lround((static_cast(cnt_in_bin) * bins_remaining / total_sample_cnt))); + int num_sub_bins = static_cast(std::lround((static_cast(cnt_in_bin) * free_bins / total_sample_cnt))); num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1; if (i == bin_upper_bound.size() - 1) { num_sub_bins = bins_remaining + 1; diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d36c588164c2..f972a16d8368 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1642,6 +1642,15 @@ def test_forced_bins(self): est = lgb.train(params, lgb_x, num_boost_round=100) predicted = est.predict(new_x) self.assertEqual(len(np.unique(predicted)), 3) + params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../examples/regression/forced_bins2.json') + params['max_bin'] = 11 + lgb_x = lgb.Dataset(x[:, :1], label=y) + est = lgb.train(params, lgb_x, num_boost_round=100) + predicted = est.predict(x[1:, :1]) + vals, counts = np.unique(predicted, return_counts=True) + self.assertGreaterEqual(min(counts), 9) + self.assertLessEqual(max(counts), 11) def test_binning_same_sign(self): # test that binning works properly for features with only positive or only negative values From 9c767ae0f3701e4e02ffcfbbe3bf28afe7a10667 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sat, 21 Sep 2019 13:54:43 +1000 Subject: [PATCH 42/49] Add warning when using categorical features with forced bins. --- include/LightGBM/dataset.h | 3 ++- src/io/dataset.cpp | 27 +++++++++++++++++++++------ src/io/dataset_loader.cpp | 5 +++-- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 900487eafbf4..6b79ac42770c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -596,7 +596,8 @@ class Dataset { void addFeaturesFrom(Dataset* other); - static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features); + static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features, + std::unordered_set categorical_features); private: std::string data_filename_; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 8048dafdec4e..6d16fefdeffc 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -329,7 +329,14 @@ void Dataset::Construct( max_bin_by_feature_.resize(num_total_features_); max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end()); } - forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); + // get categorical features from the bin types so that we can read the forced bin bounds + std::unordered_set categorical_features; + for (int i = 0; i < num_total_features_; ++i){ + if ((bin_mappers[i] != nullptr) && (bin_mappers[i]->bin_type() == BinType::CategoricalBin)){ + categorical_features.insert(i); + } + } + forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features); max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; @@ -363,7 +370,10 @@ void Dataset::ResetConfig(const char* parameters) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } if (param.count("forcedbins_filename")) { - std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_); + /* Since the dataset is already constructed we don't know which bins are categorical. + Therefore read forced bins assuming no categorical features, and warn if not the same as original. */ + std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, + num_total_features_, std::unordered_set()); if (config_bounds != forced_bin_bounds_) { Log::Warning("Cannot change forced bins after constructed Dataset handle."); } @@ -1067,7 +1077,8 @@ void Dataset::addFeaturesFrom(Dataset* other) { } -std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) { +std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features, + std::unordered_set categorical_features) { std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); @@ -1083,9 +1094,13 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ for (size_t i = 0; i < forced_bins_arr.size(); ++i) { int feature_num = forced_bins_arr[i]["feature"].int_value(); CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (size_t j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (categorical_features.count(feature_num)) { + Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num); + } else { + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (size_t j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } } } // remove duplicates diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 6e60560a9be1..005bf8082011 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -584,7 +584,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, // get forced split std::string forced_bins_path = config_.forcedbins_filename; - std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col); + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col, categorical_features_); const data_size_t filter_cnt = static_cast( static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); @@ -901,7 +901,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, // get forced split std::string forced_bins_path = config_.forcedbins_filename; - std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_); + std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_, + categorical_features_); // check the range of label_idx, weight_idx and group_idx CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); From cf0afd40d9f89a06ccdd68ba2caab3b7f64d4ba5 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sat, 21 Sep 2019 14:03:15 +1000 Subject: [PATCH 43/49] Pass forced_upper_bounds by reference. --- src/io/bin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 23a19273bfbf..1a083c60cff0 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -150,7 +150,7 @@ namespace LightGBM { } std::vector FindBinWithPredefinedBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector& forced_upper_bounds) { std::vector bin_upper_bound; // get list of distinct values @@ -307,7 +307,7 @@ namespace LightGBM { } std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, - int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector forced_upper_bounds) { + int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector& forced_upper_bounds) { if (forced_upper_bounds.empty()) { return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); } else { From 25387ec31e4d16b4f4b4072f891c668d9d4656d3 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sat, 21 Sep 2019 15:14:40 +1000 Subject: [PATCH 44/49] Pass container types by const reference. --- include/LightGBM/bin.h | 2 +- include/LightGBM/dataset.h | 2 +- src/io/bin.cpp | 26 ++++++++++++-------------- src/io/dataset.cpp | 2 +- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 1c5f62cd1907..7ea86acdd764 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -149,7 +149,7 @@ class BinMapper { * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm) */ void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, - bool use_missing, bool zero_as_missing, std::vector forced_upper_bounds); + bool use_missing, bool zero_as_missing, const std::vector& forced_upper_bounds); /*! * \brief Use specific number of bin to calculate the size of this class diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 6b79ac42770c..5aa0f8e21aef 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -597,7 +597,7 @@ class Dataset { void addFeaturesFrom(Dataset* other); static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features, - std::unordered_set categorical_features); + const std::unordered_set& categorical_features); private: std::string data_filename_; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 1a083c60cff0..94349e572f52 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -150,7 +150,7 @@ namespace LightGBM { } std::vector FindBinWithPredefinedBin(const double* distinct_values, const int* counts, - int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector& forced_upper_bounds) { + int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector& forced_upper_bounds) { std::vector bin_upper_bound; // get list of distinct values @@ -204,18 +204,16 @@ namespace LightGBM { bin_upper_bound.push_back(std::numeric_limits::infinity()); // add forced bounds, excluding zeros since we have already added zero bounds - size_t i = 0; - while (i < forced_upper_bounds.size()) { - if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { - forced_upper_bounds.erase(forced_upper_bounds.begin() + i); - } else { - ++i; - } - } int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); - int num_to_insert = std::min(max_to_insert, static_cast(forced_upper_bounds.size())); - if (num_to_insert > 0) { - bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); + int num_inserted = 0; + for (size_t i = 0; i < forced_upper_bounds.size(); ++i) { + if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) { + bin_upper_bound.push_back(forced_upper_bounds[i]); + ++num_inserted; + } + if (num_inserted >= max_to_insert) { + break; + } } std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); @@ -307,7 +305,7 @@ namespace LightGBM { } std::vector FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, - int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector& forced_upper_bounds) { + int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector& forced_upper_bounds) { if (forced_upper_bounds.empty()) { return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); } else { @@ -318,7 +316,7 @@ namespace LightGBM { void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, - std::vector forced_upper_bounds) { + const std::vector& forced_upper_bounds) { int na_cnt = 0; int tmp_num_sample_values = 0; for (int i = 0; i < num_sample_values; ++i) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 6d16fefdeffc..3b3af864684d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1078,7 +1078,7 @@ void Dataset::addFeaturesFrom(Dataset* other) { std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features, - std::unordered_set categorical_features) { + const std::unordered_set& categorical_features) { std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); From cc249f0727c92b521c757187236e4c871cf59a85 Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 23 Sep 2019 22:07:08 +1000 Subject: [PATCH 45/49] Get categorical features using FeatureBinMapper. --- src/io/dataset.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 3b3af864684d..5a6cc2773e67 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -370,10 +370,17 @@ void Dataset::ResetConfig(const char* parameters) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } if (param.count("forcedbins_filename")) { - /* Since the dataset is already constructed we don't know which bins are categorical. - Therefore read forced bins assuming no categorical features, and warn if not the same as original. */ + // get categorical features from the bin types so that we can read the forced bin bounds + std::unordered_set categorical_features; + for (int i = 0; i < num_total_features_; ++i) { + int fidx = used_feature_map_[i]; + const BinMapper* bin_mapper = FeatureBinMapper(fidx); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_features.insert(i); + } + } std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, - num_total_features_, std::unordered_set()); + num_total_features_, categorical_features); if (config_bounds != forced_bin_bounds_) { Log::Warning("Cannot change forced bins after constructed Dataset handle."); } From 0e26e9f33afe92e638f82efb02751409d7990e1d Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 23 Sep 2019 22:11:12 +1000 Subject: [PATCH 46/49] Fix bug for small max_bin. --- src/io/bin.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 94349e572f52..c10d87af42a8 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -207,13 +207,13 @@ namespace LightGBM { int max_to_insert = max_bin - static_cast(bin_upper_bound.size()); int num_inserted = 0; for (size_t i = 0; i < forced_upper_bounds.size(); ++i) { + if (num_inserted >= max_to_insert) { + break; + } if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) { bin_upper_bound.push_back(forced_upper_bounds[i]); ++num_inserted; } - if (num_inserted >= max_to_insert) { - break; - } } std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); From b5752ec0a1d9887d7aa4e78720ed87d46e0dfcdf Mon Sep 17 00:00:00 2001 From: btrotta Date: Fri, 27 Sep 2019 19:06:24 +1000 Subject: [PATCH 47/49] Move GetForcedBins to DatasetLoader. --- include/LightGBM/dataset.h | 3 -- include/LightGBM/dataset_loader.h | 3 ++ src/io/dataset.cpp | 60 ++----------------------------- src/io/dataset_loader.cpp | 49 +++++++++++++++++++++++-- 4 files changed, 52 insertions(+), 63 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index cf5129ab6392..bdd693d967c5 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -596,9 +596,6 @@ class Dataset { void addFeaturesFrom(Dataset* other); - static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features, - const std::unordered_set& categorical_features); - private: std::string data_filename_; /*! \brief Store used features */ diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index ed4c2af93dc7..c5555ef387be 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -36,6 +36,9 @@ class DatasetLoader { /*! \brief Disable copy */ DatasetLoader(const DatasetLoader&) = delete; + static std::vector> GetForcedBins(std::string forced_bins_path, int num_total_features, + const std::unordered_set& categorical_features); + private: Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector* used_data_indices); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index b70e6fb19952..ea3e35fa6452 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -3,9 +3,9 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include +#include #include -#include #include #include #include @@ -13,12 +13,9 @@ #include #include #include -#include #include #include -using namespace json11; - namespace LightGBM { @@ -336,7 +333,7 @@ void Dataset::Construct( categorical_features.insert(i); } } - forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features); + forced_bin_bounds_ = DatasetLoader::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features); max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; @@ -370,20 +367,7 @@ void Dataset::ResetConfig(const char* parameters) { Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); } if (param.count("forcedbins_filename")) { - // get categorical features from the bin types so that we can read the forced bin bounds - std::unordered_set categorical_features; - for (int i = 0; i < num_total_features_; ++i) { - int fidx = used_feature_map_[i]; - const BinMapper* bin_mapper = FeatureBinMapper(fidx); - if (bin_mapper->bin_type() == BinType::CategoricalBin) { - categorical_features.insert(i); - } - } - std::vector> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, - num_total_features_, categorical_features); - if (config_bounds != forced_bin_bounds_) { - Log::Warning("Cannot change forced bins after constructed Dataset handle."); - } + Log::Warning("Cannot change forced bins after constructed Dataset handle."); } if (!io_config.monotone_constraints.empty()) { @@ -1084,42 +1068,4 @@ void Dataset::addFeaturesFrom(Dataset* other) { num_groups_ += other->num_groups_; } - -std::vector> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features, - const std::unordered_set& categorical_features) { - std::vector> forced_bins(num_total_features, std::vector()); - if (forced_bins_path != "") { - std::ifstream forced_bins_stream(forced_bins_path.c_str()); - if (forced_bins_stream.fail()) { - Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); - } else { - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (size_t i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - if (categorical_features.count(feature_num)) { - Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num); - } else { - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (size_t j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); - } - } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); - } - } - } - return forced_bins; -} - - } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index b930a2f7449c..476a5a7ae0b4 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -4,11 +4,16 @@ */ #include +#include #include #include #include #include +#include + +using namespace json11; + namespace LightGBM { DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename) @@ -584,7 +589,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, // get forced split std::string forced_bins_path = config_.forcedbins_filename; - std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col, categorical_features_); + std::vector> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, num_col, categorical_features_); const data_size_t filter_cnt = static_cast( static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); @@ -901,8 +906,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, // get forced split std::string forced_bins_path = config_.forcedbins_filename; - std::vector> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_, - categorical_features_); + std::vector> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, dataset->num_total_features_, + categorical_features_); // check the range of label_idx, weight_idx and group_idx CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); @@ -1237,4 +1242,42 @@ std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) { } } + + +std::vector> DatasetLoader::GetForcedBins(std::string forced_bins_path, int num_total_features, + const std::unordered_set& categorical_features) { + std::vector> forced_bins(num_total_features, std::vector()); + if (forced_bins_path != "") { + std::ifstream forced_bins_stream(forced_bins_path.c_str()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (size_t i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + if (categorical_features.count(feature_num)) { + Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num); + } else { + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (size_t j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); + } + } + } + return forced_bins; +} + } // namespace LightGBM From 58d86aa6bfe1659b75853cc45d30673c345c0306 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sat, 28 Sep 2019 13:24:23 +1000 Subject: [PATCH 48/49] Find forced bins in dataset_loader. --- include/LightGBM/dataset.h | 1 + src/io/dataset.cpp | 11 ++--------- src/io/dataset_loader.cpp | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bdd693d967c5..dd52571efbc1 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -290,6 +290,7 @@ class Dataset { void Construct( std::vector>* bin_mappers, + std::vector>& forced_bins, int** sample_non_zero_indices, const int* num_per_col, size_t total_sample_cnt, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index ea3e35fa6452..7d74b4fa3f5b 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -3,7 +3,6 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include -#include #include #include @@ -216,6 +215,7 @@ std::vector> FastFeatureBundling(const std::vector>* bin_mappers, + std::vector>& forced_bins, int** sample_non_zero_indices, const int* num_per_col, size_t total_sample_cnt, @@ -326,14 +326,7 @@ void Dataset::Construct( max_bin_by_feature_.resize(num_total_features_); max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end()); } - // get categorical features from the bin types so that we can read the forced bin bounds - std::unordered_set categorical_features; - for (int i = 0; i < num_total_features_; ++i){ - if ((bin_mappers->at(i) != nullptr) && (bin_mappers->at(i)->bin_type() == BinType::CategoricalBin)){ - categorical_features.insert(i); - } - } - forced_bin_bounds_ = DatasetLoader::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features); + forced_bin_bounds_ = forced_bins; max_bin_ = io_config.max_bin; min_data_in_bin_ = io_config.min_data_in_bin; bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 476a5a7ae0b4..da0f6b9dfc32 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -720,7 +720,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values, } } auto dataset = std::unique_ptr(new Dataset(num_data)); - dataset->Construct(&bin_mappers, sample_indices, num_per_col, total_sample_size, config_); + dataset->Construct(&bin_mappers, forced_bin_bounds, sample_indices, num_per_col, total_sample_size, config_); dataset->set_feature_names(feature_names_); return dataset.release(); } @@ -1053,7 +1053,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, } } sample_values.clear(); - dataset->Construct(&bin_mappers, Common::Vector2Ptr(&sample_indices).data(), + dataset->Construct(&bin_mappers, forced_bin_bounds, Common::Vector2Ptr(&sample_indices).data(), Common::VectorSize(sample_indices).data(), sample_data.size(), config_); } From 3e81b944d88934e7e639a1af68acc2cf6abeacc7 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sat, 28 Sep 2019 22:15:51 +1000 Subject: [PATCH 49/49] Minor fixes. --- include/LightGBM/dataset.h | 2 +- src/io/dataset.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index dd52571efbc1..3d0ae990201d 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -290,7 +290,7 @@ class Dataset { void Construct( std::vector>* bin_mappers, - std::vector>& forced_bins, + const std::vector>& forced_bins, int** sample_non_zero_indices, const int* num_per_col, size_t total_sample_cnt, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 7d74b4fa3f5b..54c8fcc22481 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -215,7 +215,7 @@ std::vector> FastFeatureBundling(const std::vector>* bin_mappers, - std::vector>& forced_bins, + const std::vector>& forced_bins, int** sample_non_zero_indices, const int* num_per_col, size_t total_sample_cnt, @@ -436,6 +436,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { group_feature_cnt_ = dataset->group_feature_cnt_; monotone_types_ = dataset->monotone_types_; feature_penalty_ = dataset->feature_penalty_; + forced_bin_bounds_ = dataset->forced_bin_bounds_; } void Dataset::CreateValid(const Dataset* dataset) { @@ -490,6 +491,7 @@ void Dataset::CreateValid(const Dataset* dataset) { } monotone_types_ = dataset->monotone_types_; feature_penalty_ = dataset->feature_penalty_; + forced_bin_bounds_ = dataset->forced_bin_bounds_; } void Dataset::ReSize(data_size_t num_data) {