Predefined bin thresholds (#2325)
* Fix bug where small values of max_bin cause crash.

* Revert "Fix bug where small values of max_bin cause crash."

This reverts commit fe5c8e2.

* Add functionality to force bin thresholds.

* Fix style issues.

* Use stable sort.

* Minor style and doc fixes.

* Change binning behavior to be same as PR #2342.

* Use different bin finding function for predefined bounds.

* Minor refactoring, overload FindBinWithZeroAsOneBin.

* Fix bug and add new test.

* Add warning when using categorical features with forced bins.

* Pass forced_upper_bounds by reference.

* Pass container types by const reference.

* Get categorical features using FeatureBinMapper.

* Fix bug for small max_bin.

* Move GetForcedBins to DatasetLoader.

* Find forced bins in dataset_loader.

* Minor fixes.
btrotta authored and guolinke committed Sep 28, 2019
1 parent f2632a6 commit cc7a1e2
Showing 11 changed files with 323 additions and 18 deletions.
8 changes: 8 additions & 0 deletions docs/Parameters.rst
@@ -414,6 +414,14 @@ Learning Control Parameters

- see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example

- ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string

- path to a ``.json`` file that specifies bin upper bounds for some or all features

- ``.json`` file should contain an array of objects, each containing the key ``feature`` (an integer feature index) and the key ``bin_upper_bound`` (an array of threshold values for binning)

- see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example

- ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``

- decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
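To make the documented format concrete, a forced bins file might look like the following; the feature indices and thresholds here are invented for illustration (see the linked example file for the repository's actual values):

[
    {
        "feature": 0,
        "bin_upper_bound": [0.25, 0.5, 0.75]
    },
    {
        "feature": 2,
        "bin_upper_bound": [-0.4, 0.1, 0.9]
    }
]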
3 changes: 3 additions & 0 deletions examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
# number of bins for feature buckets; 255 is a recommended setting: it can save memory and also gives good accuracy.
max_bin = 255

# forced bin thresholds
# forcedbins_filename = forced_bins.json

# training data
# if a weight file exists, it should be named "regression.train.weight"
# alias: train_data, train
3 changes: 2 additions & 1 deletion include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
* \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);

/*!
* \brief Use specific number of bin to calculate the size of this class
5 changes: 5 additions & 0 deletions include/LightGBM/config.h
@@ -412,6 +412,11 @@ struct Config {
// desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
std::string forcedsplits_filename = "";

// desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
// desc = ``.json`` file should contain an array of objects, each containing the key ``feature`` (an integer feature index) and the key ``bin_upper_bound`` (an array of threshold values for binning)
// desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
std::string forcedbins_filename = "";

// check = >=0.0
// check = <=1.0
// desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
2 changes: 2 additions & 0 deletions include/LightGBM/dataset.h
@@ -290,6 +290,7 @@ class Dataset {

void Construct(
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
const std::vector<std::vector<double>>& forced_bins,
int** sample_non_zero_indices,
const int* num_per_col,
size_t total_sample_cnt,
@@ -630,6 +631,7 @@ class Dataset {
bool is_finish_load_;
int max_bin_;
std::vector<int32_t> max_bin_by_feature_;
std::vector<std::vector<double>> forced_bin_bounds_;
int bin_construct_sample_cnt_;
int min_data_in_bin_;
bool use_missing_;
3 changes: 3 additions & 0 deletions include/LightGBM/dataset_loader.h
@@ -36,6 +36,9 @@ class DatasetLoader {
/*! \brief Disable copy */
DatasetLoader(const DatasetLoader&) = delete;

static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features,
const std::unordered_set<int>& categorical_features);

private:
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

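Based only on the declaration above, a minimal usage sketch of the new helper; the surrounding setup is assumed, not part of this commit:

#include <string>
#include <unordered_set>
#include <vector>

#include <LightGBM/dataset_loader.h>

void ForcedBinsExample() {
  // Suppose a 5-feature dataset where feature 3 is categorical; the commit
  // warns when forced bins are specified for categorical features.
  std::unordered_set<int> categorical_features = {3};
  std::vector<std::vector<double>> forced =
      LightGBM::DatasetLoader::GetForcedBins("forced_bins.json", 5,
                                             categorical_features);
  // forced.size() == 5; forced[i] holds the bin_upper_bound array for
  // feature i, or is empty if the file specifies none, in which case the
  // binning code falls back to the unconstrained algorithm.
}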
125 changes: 118 additions & 7 deletions src/io/bin.cpp
@@ -71,7 +71,7 @@ namespace LightGBM {
return true;
}

std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
CHECK(max_bin > 0);
@@ -149,8 +149,105 @@ namespace LightGBM {
return bin_upper_bound;
}

std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
std::vector<double> bin_upper_bound;

// get list of distinct values
int left_cnt_data = 0;
int cnt_zero = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroThreshold) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroThreshold) {
right_cnt_data += counts[i];
} else {
cnt_zero += counts[i];
}
}

// get number of positive and negative distinct values
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroThreshold) {
left_cnt = i;
break;
}
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroThreshold) {
right_start = i;
break;
}
}

// include zero bounds and infinity bound
if (max_bin == 2) {
if (left_cnt == 0) {
bin_upper_bound.push_back(kZeroThreshold);
} else {
bin_upper_bound.push_back(-kZeroThreshold);
}
} else if (max_bin >= 3) {
if (left_cnt > 0) {
bin_upper_bound.push_back(-kZeroThreshold);
}
if (right_start >= 0) {
bin_upper_bound.push_back(kZeroThreshold);
}
}
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());

// add forced bounds, excluding zeros since we have already added zero bounds
int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
int num_inserted = 0;
for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
if (num_inserted >= max_to_insert) {
break;
}
if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
bin_upper_bound.push_back(forced_upper_bounds[i]);
++num_inserted;
}
}
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());

// find remaining bounds
int free_bins = max_bin - static_cast<int>(bin_upper_bound.size());
std::vector<double> bounds_to_add;
int value_ind = 0;
for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
int cnt_in_bin = 0;
int distinct_cnt_in_bin = 0;
int bin_start = value_ind;
while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
cnt_in_bin += counts[value_ind];
++distinct_cnt_in_bin;
++value_ind;
}
int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * free_bins / total_sample_cnt)));
num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
if (i == bin_upper_bound.size() - 1) {
num_sub_bins = bins_remaining + 1;
}
std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
num_sub_bins, cnt_in_bin, min_data_in_bin);
bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity
}
bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
return bin_upper_bound;
}
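// Worked example (values assumed, not from the commit): with max_bin = 8,
// forced_upper_bounds = {0.5}, and samples spread over [-1, 1], the bounds
// start as {-kZeroThreshold, kZeroThreshold, +inf}; 0.5 is then inserted,
// and GreedyFindBin subdivides each resulting interval in proportion to its
// share of the samples, so forced thresholds are always kept while the
// remaining bins adapt to the data.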

std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
int left_cnt_data = 0;
int cnt_zero = 0;
@@ -207,8 +304,19 @@ namespace LightGBM {
return bin_upper_bound;
}

std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
if (forced_upper_bounds.empty()) {
return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else {
return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin,
forced_upper_bounds);
}
}

void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing,
const std::vector<double>& forced_upper_bounds) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
@@ -276,14 +384,17 @@ namespace LightGBM {
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
min_data_in_bin, forced_upper_bounds);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
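As a sketch of how the extended interface is driven end to end, here is a hypothetical direct call to BinMapper::FindBin with one forced threshold; the data and parameter values are invented for illustration:

#include <vector>

#include <LightGBM/bin.h>

void FindBinExample() {
  std::vector<double> sample = {0.1, 0.2, 0.4, 0.6, 0.9, 1.5};
  std::vector<double> forced_upper_bounds = {0.5};  // must become a bin upper bound
  LightGBM::BinMapper mapper;
  mapper.FindBin(sample.data(), static_cast<int>(sample.size()),
                 /*total_sample_cnt=*/sample.size(), /*max_bin=*/255,
                 /*min_data_in_bin=*/3, /*min_split_data=*/2,
                 LightGBM::BinType::NumericalBin,
                 /*use_missing=*/false, /*zero_as_missing=*/false,
                 forced_upper_bounds);
  // An empty forced_upper_bounds vector routes to the original
  // FindBinWithZeroAsOneBin, leaving existing behavior unchanged.
}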
4 changes: 4 additions & 0 deletions src/io/config_auto.cpp
@@ -215,6 +215,7 @@ std::unordered_set<std::string> Config::parameter_set({
"monotone_constraints",
"feature_contri",
"forcedsplits_filename",
"forcedbins_filename",
"refit_decay_rate",
"cegb_tradeoff",
"cegb_penalty_split",
@@ -406,6 +407,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str

GetString(params, "forcedsplits_filename", &forcedsplits_filename);

GetString(params, "forcedbins_filename", &forcedbins_filename);

GetDouble(params, "refit_decay_rate", &refit_decay_rate);
CHECK(refit_decay_rate >=0.0);
CHECK(refit_decay_rate <=1.0);
@@ -621,6 +624,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
30 changes: 30 additions & 0 deletions src/io/dataset.cpp
@@ -15,6 +15,7 @@
#include <sstream>
#include <unordered_map>


namespace LightGBM {

const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
@@ -214,6 +215,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_

void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
const std::vector<std::vector<double>>& forced_bins,
int** sample_non_zero_indices,
const int* num_per_col,
size_t total_sample_cnt,
@@ -324,6 +326,7 @@ void Dataset::Construct(
max_bin_by_feature_.resize(num_total_features_);
max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
}
forced_bin_bounds_ = forced_bins;
max_bin_ = io_config.max_bin;
min_data_in_bin_ = io_config.min_data_in_bin;
bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -356,6 +359,9 @@ void Dataset::ResetConfig(const char* parameters) {
if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
}
if (param.count("forcedbins_filename")) {
Log::Warning("Cannot change forced bins after constructed Dataset handle.");
}

if (!io_config.monotone_constraints.empty()) {
CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
@@ -430,6 +436,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
group_feature_cnt_ = dataset->group_feature_cnt_;
monotone_types_ = dataset->monotone_types_;
feature_penalty_ = dataset->feature_penalty_;
forced_bin_bounds_ = dataset->forced_bin_bounds_;
}

void Dataset::CreateValid(const Dataset* dataset) {
@@ -484,6 +491,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
}
monotone_types_ = dataset->monotone_types_;
feature_penalty_ = dataset->feature_penalty_;
forced_bin_bounds_ = dataset->forced_bin_bounds_;
}

void Dataset::ReSize(data_size_t num_data) {
@@ -657,6 +665,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
}
// size of forced bins
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
}
writer->Write(&size_of_header, sizeof(size_of_header));
// write header
writer->Write(&num_data_, sizeof(num_data_));
@@ -705,6 +717,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
const char* c_str = feature_names_[i].c_str();
writer->Write(c_str, sizeof(char) * str_len);
}
// write forced bins
for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->Write(&num_bounds, sizeof(int));

for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
}
}

// get size of meta data
size_t size_of_metadata = metadata_.SizesInByte();
@@ -754,6 +775,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
for (auto n : feature_names_) {
fprintf(file, "%s, ", n.c_str());
}
fprintf(file, "\nforced_bins: ");
for (int i = 0; i < num_total_features_; ++i) {
fprintf(file, "\nfeature %d: ", i);
for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
}
}
std::vector<std::unique_ptr<BinIterator>> iterators;
iterators.reserve(num_features_);
for (int j = 0; j < num_features_; ++j) {
@@ -1005,6 +1033,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
PushVector(&feature_names_, other->feature_names_);
PushVector(&feature2subfeature_, other->feature2subfeature_);
PushVector(&group_feature_cnt_, other->group_feature_cnt_);
PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
feature_groups_.reserve(other->feature_groups_.size());
for (auto& fg : other->feature_groups_) {
feature_groups_.emplace_back(new FeatureGroup(*fg));
@@ -1027,6 +1056,7 @@

PushClearIfEmpty(&monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
PushClearIfEmpty(&feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
PushClearIfEmpty(&max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);

num_features_ += other->num_features_;
num_total_features_ += other->num_total_features_;
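The binary serialization added above writes, per feature, an int count followed by that many doubles. A sketch of the matching read side (assumed for illustration; the actual loading code lives in parts of the diff not shown on this page):

#include <cstdio>
#include <vector>

std::vector<std::vector<double>> ReadForcedBinBounds(FILE* f, int num_total_features) {
  std::vector<std::vector<double>> bounds(num_total_features);
  for (int i = 0; i < num_total_features; ++i) {
    int num_bounds = 0;
    if (std::fread(&num_bounds, sizeof(int), 1, f) != 1) break;  // count comes first
    bounds[i].resize(num_bounds);
    if (num_bounds > 0) {
      std::fread(bounds[i].data(), sizeof(double), num_bounds, f);
    }
  }
  return bounds;
}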