From fe5c8e2547057c1fa5750bcddd359dd7708fab4b Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 31 Jul 2019 22:47:39 +1000
Subject: [PATCH 01/49] Fix bug where small values of max_bin cause crash.

---
 src/io/bin.cpp                           | 23 +++++++++++++++++------
 tests/python_package_test/test_engine.py | 19 +++++++++++++++++++
 2 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 617bdf5bac73..d77a73ef9336 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -177,11 +177,10 @@ namespace LightGBM {
       left_cnt = num_distinct_values;
     }
 
-    if (left_cnt > 0) {
+    if ((left_cnt > 0) && (max_bin > 1)) {
       int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
       left_max_bin = std::max(1, left_max_bin);
       bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
-      bin_upper_bound.back() = -kZeroThreshold;
     }
 
     int right_start = -1;
@@ -192,16 +191,27 @@ namespace LightGBM {
       }
     }
 
-    if (right_start >= 0) {
-      int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
-      CHECK(right_max_bin > 0);
+    if (bin_upper_bound.size() == 0) {
+      if (max_bin > 1) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
+    } else {
+      bin_upper_bound.back() = -kZeroThreshold;
+      if (max_bin > 2) {
+        // create zero bin
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
+    }
+
+    int right_max_bin = max_bin - static_cast<int>(bin_upper_bound.size());
+    if ((right_start >= 0) && (right_max_bin > 0)) {
       auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
         num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
-      bin_upper_bound.push_back(kZeroThreshold);
       bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
     } else {
       bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     }
+    CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
 
@@ -280,6 +290,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
         bin_upper_bound_.push_back(NaN);
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4c9a9eddc6c6..475cfebbb8c0 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -901,6 +901,25 @@ def test_max_bin_by_feature(self):
         est = lgb.train(params, lgb_data, num_boost_round=1)
         self.assertEqual(len(np.unique(est.predict(X))), 3)
 
+    def test_small_max_bin(self):
+        np.random.seed(0)
+        y = np.random.choice([0, 1], 100)
+        x = np.zeros((100, 1))
+        x[:30, 0] = -1
+        x[30:60, 0] = 1
+        x[60:, 0] = 2
+        params = {'objective': 'binary',
+                  'seed': 0,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'max_bin': 2}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=5)
+        x[0, 0] = np.nan
+        params['max_bin'] = 3
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=5)
+
     def test_refit(self):
         X, y = load_breast_cancer(True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

From 439bcfd0600ae795630b8303ce8e19bc1fc90378 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 31 Jul 2019 22:53:57 +1000
Subject: [PATCH 02/49] Revert "Fix bug where small values of max_bin cause
 crash."

This reverts commit fe5c8e2547057c1fa5750bcddd359dd7708fab4b.
---
 src/io/bin.cpp                           | 23 ++++++-----------------
 tests/python_package_test/test_engine.py | 19 -------------------
 2 files changed, 6 insertions(+), 36 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index d77a73ef9336..617bdf5bac73 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -177,10 +177,11 @@ namespace LightGBM {
       left_cnt = num_distinct_values;
     }
 
-    if ((left_cnt > 0) && (max_bin > 1)) {
+    if (left_cnt > 0) {
       int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
       left_max_bin = std::max(1, left_max_bin);
       bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
+      bin_upper_bound.back() = -kZeroThreshold;
     }
 
     int right_start = -1;
@@ -191,27 +192,16 @@ namespace LightGBM {
       }
     }
 
-    if (bin_upper_bound.size() == 0) {
-      if (max_bin > 1) {
-        bin_upper_bound.push_back(kZeroThreshold);
-      }
-    } else {
-      bin_upper_bound.back() = -kZeroThreshold;
-      if (max_bin > 2) {
-        // create zero bin
-        bin_upper_bound.push_back(kZeroThreshold);
-      }
-    }
-
-    int right_max_bin = max_bin - static_cast<int>(bin_upper_bound.size());
-    if ((right_start >= 0) && (right_max_bin > 0)) {
+    if (right_start >= 0) {
+      int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
+      CHECK(right_max_bin > 0);
       auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
         num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+      bin_upper_bound.push_back(kZeroThreshold);
       bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
     } else {
       bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     }
-    CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
 
@@ -290,7 +280,6 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
-
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
         bin_upper_bound_.push_back(NaN);
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 475cfebbb8c0..4c9a9eddc6c6 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -901,25 +901,6 @@ def test_max_bin_by_feature(self):
         est = lgb.train(params, lgb_data, num_boost_round=1)
         self.assertEqual(len(np.unique(est.predict(X))), 3)
 
-    def test_small_max_bin(self):
-        np.random.seed(0)
-        y = np.random.choice([0, 1], 100)
-        x = np.zeros((100, 1))
-        x[:30, 0] = -1
-        x[30:60, 0] = 1
-        x[60:, 0] = 2
-        params = {'objective': 'binary',
-                  'seed': 0,
-                  'min_data_in_leaf': 1,
-                  'verbose': -1,
-                  'max_bin': 2}
-        lgb_x = lgb.Dataset(x, label=y)
-        est = lgb.train(params, lgb_x, num_boost_round=5)
-        x[0, 0] = np.nan
-        params['max_bin'] = 3
-        lgb_x = lgb.Dataset(x, label=y)
-        est = lgb.train(params, lgb_x, num_boost_round=5)
-
     def test_refit(self):
         X, y = load_breast_cancer(True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

From 34e72c87c6d610b4fbf20e30a8bcf01989963a5e Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 03/49] Add functionality to force bin thresholds.

---
 docs/Parameters.rst                      |  8 +++
 include/LightGBM/bin.h                   |  3 +-
 include/LightGBM/config.h                |  5 ++
 include/LightGBM/dataset.h               |  3 +
 src/io/bin.cpp                           | 86 ++++++++++++++++++------
 src/io/config_auto.cpp                   |  4 ++
 src/io/dataset.cpp                       | 64 +++++++++++++++++-
 src/io/dataset_loader.cpp                | 46 ++++++++++---
 tests/data/forced_bins.json              | 10 +++
 tests/python_package_test/test_engine.py | 32 ++++++++-
 10 files changed, 227 insertions(+), 34 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 93c241bce215..584237464fd1 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -404,6 +404,14 @@ Learning Control Parameters
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
 
+-  ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+   -  path to a ``.json`` file that specifies bin upper bounds for some or all features
+
+   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bound`` (array of thresholds for binning)
+
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
    -  decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 46baee58fc46..1c5f62cd1907 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
   * \param bin_type Type of this bin
   * \param use_missing True to enable missing value handle
   * \param zero_as_missing True to use zero as missing value
+  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
   */
   void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing);
+               bool use_missing, bool zero_as_missing, std::vector<double> forced_upper_bounds);
 
   /*!
   * \brief Use specific number of bin to calculate the size of this class
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 08b2a7352c0a..1c0c14f69508 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -402,6 +402,11 @@ struct Config {
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
   std::string forcedsplits_filename = "";
 
+  // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
+  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bound`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  std::string forcedbins_filename = "";
+
   // check = >=0.0
   // check = <=1.0
   // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index e688522fbb1a..900487eafbf4 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -596,6 +596,8 @@ class Dataset {
 
   void addFeaturesFrom(Dataset* other);
 
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features);
+
  private:
   std::string data_filename_;
   /*! \brief Store used features */
@@ -630,6 +632,7 @@ class Dataset {
   bool is_finish_load_;
   int max_bin_;
   std::vector<int32_t> max_bin_by_feature_;
+  std::vector<std::vector<double>> forced_bin_bounds_;
   int bin_construct_sample_cnt_;
   int min_data_in_bin_;
   bool use_missing_;
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 617bdf5bac73..62713d1bddd3 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -150,8 +150,10 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
+
+    // get list of distinct values
     int left_cnt_data = 0;
     int cnt_zero = 0;
     int right_cnt_data = 0;
@@ -165,6 +167,7 @@ namespace LightGBM {
       }
     }
 
+    // get number of positive and negative distinct values
     int left_cnt = -1;
     for (int i = 0; i < num_distinct_values; ++i) {
       if (distinct_values[i] > -kZeroThreshold) {
@@ -172,18 +175,9 @@ namespace LightGBM {
         break;
       }
     }
-
     if (left_cnt < 0) {
       left_cnt = num_distinct_values;
     }
-
-    if (left_cnt > 0) {
-      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
-      left_max_bin = std::max(1, left_max_bin);
-      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
-      bin_upper_bound.back() = -kZeroThreshold;
-    }
-
     int right_start = -1;
     for (int i = left_cnt; i < num_distinct_values; ++i) {
       if (distinct_values[i] > kZeroThreshold) {
@@ -192,21 +186,66 @@ namespace LightGBM {
       }
     }
 
-    if (right_start >= 0) {
-      int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
-      CHECK(right_max_bin > 0);
-      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
-        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+    // include zero bounds if possible
+    if (max_bin == 2) {
+      if (left_cnt == 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      } else {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+    } else if (max_bin >= 3) {
+      bin_upper_bound.push_back(-kZeroThreshold);
       bin_upper_bound.push_back(kZeroThreshold);
-      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
-    } else {
-      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     }
+    
+    // add forced bounds, excluding zeros since we have already added zero bounds
+    int i = 0;
+    while (i < forced_upper_bounds.size()) {
+      if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
+        forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
+      } else {
+        ++i;
+      }
+    }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
+    int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
+    if (num_to_insert > 0) {
+      bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
+    }
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+
+    // find remaining bounds
+    std::vector<double> bounds_to_add;
+    int value_ind = 0;
+    for (int i = 0; i < bin_upper_bound.size(); ++i) {
+      int cnt_in_bin = 0;
+      int distinct_cnt_in_bin = 0;
+      int bin_start = value_ind;
+      while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
+        cnt_in_bin += counts[value_ind];
+        ++distinct_cnt_in_bin;
+        ++value_ind;
+      }
+      int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt)));
+      num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
+      if (i == bin_upper_bound.size() - 1) {
+        num_sub_bins = bins_remaining + 1;
+      }
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, 
+                                                            num_sub_bins, cnt_in_bin, min_data_in_bin);
+      bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
+    }
+    bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
 
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
-    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
+    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, 
+    std::vector<double> forced_upper_bounds) {
     int na_cnt = 0;
     int tmp_num_sample_values = 0;
     for (int i = 0; i < num_sample_values; ++i) {
@@ -274,14 +313,17 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                  min_data_in_bin, forced_upper_bounds);
       } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 8d75b1cde3df..ad5b43811ebe 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -211,6 +211,7 @@ std::unordered_set<std::string> Config::parameter_set({
   "monotone_constraints",
   "feature_contri",
   "forcedsplits_filename",
+  "forcedbins_filename",
   "refit_decay_rate",
   "cegb_tradeoff",
   "cegb_penalty_split",
@@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
 
   GetString(params, "forcedsplits_filename", &forcedsplits_filename);
 
+  GetString(params, "forcedbins_filename", &forcedbins_filename);
+
   GetDouble(params, "refit_decay_rate", &refit_decay_rate);
   CHECK(refit_decay_rate >=0.0);
   CHECK(refit_decay_rate <=1.0);
@@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
   str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
   str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
+  str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
   str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
   str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
   str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index f201a40a1a7a..c931e945cd24 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -8,12 +8,17 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
 #include <cstdio>
 #include <sstream>
 #include <unordered_map>
+#include <fstream>
+
+using namespace json11;
+
 
 namespace LightGBM {
 
@@ -324,6 +329,7 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
+  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) {
   if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
+  if (param.count("forcedbins_filename")) {
+    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
+    if (config_bounds != forced_bin_bounds_) {
+      Log::Warning("Cannot change forced bins after constructed Dataset handle.");
+    }
+  }
 
   if (!io_config.monotone_constraints.empty()) {
     CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
@@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
     for (int i = 0; i < num_total_features_; ++i) {
       size_of_header += feature_names_[i].size() + sizeof(int);
     }
+    // size of forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
+    }
     writer->Write(&size_of_header, sizeof(size_of_header));
     // write header
     writer->Write(&num_data_, sizeof(num_data_));
@@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
       const char* c_str = feature_names_[i].c_str();
       writer->Write(c_str, sizeof(char) * str_len);
     }
+    // write forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
+      writer->Write(&num_bounds, sizeof(int));
+      
+      for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
+      }
+    }
 
     // get size of meta data
     size_t size_of_metadata = metadata_.SizesInByte();
@@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
   for (auto n : feature_names_) {
     fprintf(file, "%s, ", n.c_str());
   }
+  fprintf(file, "\nforced_bins: ");
+  for (int i = 0; i < num_total_features_; ++i) {
+    fprintf(file, "\nfeature %d: ", i);
+    for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
+    }
+  }
   std::vector<std::unique_ptr<BinIterator>> iterators;
   iterators.reserve(num_features_);
   for (int j = 0; j < num_features_; ++j) {
@@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   PushVector(feature_names_, other->feature_names_);
   PushVector(feature2subfeature_, other->feature2subfeature_);
   PushVector(group_feature_cnt_, other->group_feature_cnt_);
+  PushVector(forced_bin_bounds_, other->forced_bin_bounds_);
   feature_groups_.reserve(other->feature_groups_.size());
   for (auto& fg : other->feature_groups_) {
     feature_groups_.emplace_back(new FeatureGroup(*fg));
@@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) {
 
   PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
   PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
-
+  PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
   num_features_ += other->num_features_;
   num_total_features_ += other->num_total_features_;
   num_groups_ += other->num_groups_;
 }
 
+
+std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    std::stringstream buffer;
+    buffer << forced_bins_stream.rdbuf();
+    std::string err;
+    Json forced_bins_json = Json::parse(buffer.str(), err);
+    CHECK(forced_bins_json.is_array());
+    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+    for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      int feature_num = forced_bins_arr[i]["feature"].int_value();
+      CHECK(feature_num < num_total_features);
+      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+      for (int j = 0; j < bounds_arr.size(); ++j) {
+        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+      }
+    }
+    // remove consecutive duplicates (std::unique only collapses adjacent equal values; the bounds are not sorted here)
+    for (int i = 0; i < num_total_features; ++i) {
+      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+      forced_bins[i].erase(new_end, forced_bins[i].end());
+    }
+  }
+  return forced_bins;
+}
+
+
 }  // namespace LightGBM
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 1130d803ea36..f36d5b1df27d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
@@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
     }
     dataset->feature_names_.emplace_back(str_buf.str());
   }
+  // get forced_bin_bounds_
+  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
+  for (int i = 0; i < dataset->num_total_features_; ++i) {
+    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
+    mem_ptr += sizeof(int);
+    dataset->forced_bin_bounds_[i] = std::vector<double>();
+    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
+    
+    for (int j = 0; j < num_bounds; ++j) {
+      double bound = tmp_ptr_forced_bounds[j];
+      dataset->forced_bin_bounds_[i].push_back(bound);
+    }
+    mem_ptr += num_bounds * sizeof(double);
+   
+  }
 
   // read size of meta data
   read_cnt = reader->Read(buffer.data(), sizeof(size_t));
@@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
   return dataset.release();
 }
 
+
 Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
                                                int** sample_indices, int num_col, const int* num_per_col,
                                                size_t total_sample_size, data_size_t num_data) {
@@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
+
+  // get forced bin bounds
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col);
+
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
   if (Network::num_machines() == 1) {
@@ -585,12 +605,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin, config_.min_data_in_bin, filter_cnt,
-                                bin_type, config_.use_missing, config_.zero_as_missing);
+                                bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin_by_feature[i], config_.min_data_in_bin,
                                 filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -630,12 +651,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin_by_feature[start[rank] + i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -872,6 +894,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
 
+  // get forced bin bounds
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_);
+
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
   CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
@@ -909,12 +935,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -955,13 +982,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type,
-                                config_.use_missing, config_.zero_as_missing);
+                                config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4c9a9eddc6c6..59ea0113f50a 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -895,7 +895,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 100)
+        self.assertEqual(len(np.unique(est.predict(X))), 99)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
@@ -1544,3 +1544,33 @@ def constant_metric(preds, train_data):
                                                          decreasing_metric(preds, train_data)],
                         early_stopping_rounds=5, verbose_eval=False)
         self.assertEqual(gbm.best_iteration, 1)
+
+    def test_forced_bins(self):
+        x = np.zeros((100, 2))
+        x[:, 0] = np.arange(0, 1, 0.01)
+        x[:, 1] = -np.arange(0, 1, 0.01)
+        y = np.arange(0, 1, 0.01)
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        params = {'objective': 'regression_l1',
+                  'max_bin': 6,
+                  'forcedbins_filename': forcedbins_filename,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, x.shape[1]))
+        new_x[:, 0] = [0.31, 0.37, 0.41]
+        new_x[:, 1] = [0, 0, 0]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        new_x[:, 0] = [0, 0, 0]
+        new_x[:, 1] = [-0.25, -0.5, -0.9]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 1)
+        params['forcedbins_filename'] = ''
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)

From 5b21573ecb4dd9e463e47783b9a8309f000c6bf2 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 04/49] Fix style issues.

---
 docs/Parameters.rst                           |  4 +-
 .../regression}/forced_bins.json              |  2 +-
 examples/regression/train.conf                |  3 ++
 include/LightGBM/config.h                     |  4 +-
 src/io/bin.cpp                                |  2 +-
 src/io/dataset.cpp                            | 40 ++++++++++---------
 src/io/dataset_loader.cpp                     |  1 +
 tests/python_package_test/test_engine.py      |  3 +-
 8 files changed, 34 insertions(+), 25 deletions(-)
 rename {tests/data => examples/regression}/forced_bins.json (98%)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 584237464fd1..83a04b992393 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,9 +408,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json
similarity index 98%
rename from tests/data/forced_bins.json
rename to examples/regression/forced_bins.json
index aa74c36ffb78..1ee0a49d727c 100644
--- a/tests/data/forced_bins.json
+++ b/examples/regression/forced_bins.json
@@ -7,4 +7,4 @@
         "feature": 1,
         "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
     }
-]
\ No newline at end of file
+]
diff --git a/examples/regression/train.conf b/examples/regression/train.conf
index 11396c23ecc2..4c73169dc8f9 100644
--- a/examples/regression/train.conf
+++ b/examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
 # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
 max_bin = 255
 
+# forced bin thresholds
+# forcedbins_filename = forced_bins.json
+
 # training data
 # if exsting weight file, should name to "regression.train.weight"
 # alias: train_data, train
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 1c0c14f69508..89fa57453c88 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,8 +403,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 62713d1bddd3..2556a59b4715 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                  min_data_in_bin, forced_upper_bounds);
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index c931e945cd24..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -5,10 +5,10 @@
 #include <LightGBM/dataset.h>
 
 #include <LightGBM/feature_group.h>
+#include <LightGBM/json11.hpp>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1071,24 +1071,28 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    std::stringstream buffer;
-    buffer << forced_bins_stream.rdbuf();
-    std::string err;
-    Json forced_bins_json = Json::parse(buffer.str(), err);
-    CHECK(forced_bins_json.is_array());
-    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-    for (int i = 0; i < forced_bins_arr.size(); ++i) {
-      int feature_num = forced_bins_arr[i]["feature"].int_value();
-      CHECK(feature_num < num_total_features);
-      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-      for (int j = 0; j < bounds_arr.size(); ++j) {
-        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+        for (int j = 0; j < bounds_arr.size(); ++j) {
+          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
       }
-    }
-    // remove duplicates
-    for (int i = 0; i < num_total_features; ++i) {
-      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-      forced_bins[i].erase(new_end, forced_bins[i].end());
     }
   }
   return forced_bins;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index f36d5b1df27d..eb83d74bfe3d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 59ea0113f50a..d55bac7711a1 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1550,7 +1550,8 @@ def test_forced_bins(self):
         x[:, 0] = np.arange(0, 1, 0.01)
         x[:, 1] = -np.arange(0, 1, 0.01)
         y = np.arange(0, 1, 0.01)
-        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                           '../../examples/regression/forced_bins.json')
         params = {'objective': 'regression_l1',
                   'max_bin': 6,
                   'forcedbins_filename': forcedbins_filename,

From 2be599af63dbe4d750c12cdad1737fae4628c64d Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:19:58 +1000
Subject: [PATCH 05/49] Use stable sort.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2556a59b4715..b26a6a461e3e 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }

From 6a098f0f432db9371fb445357f4a92543490a5cb Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 06/49] Minor style and doc fixes.

---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 src/io/dataset_loader.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 83a04b992393..d6f8a2a8e118 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,7 +408,7 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 89fa57453c88..0a621f0036d0 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,7 +403,7 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index eb83d74bfe3d..c00b9b7fdae5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 8f736369106564377bd02e496b31e16a5e894797 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 07/49] Add functionality to force bin thresholds.

---
 docs/Parameters.rst                      |  8 ++
 include/LightGBM/bin.h                   |  3 +-
 include/LightGBM/config.h                |  5 ++
 include/LightGBM/dataset.h               |  3 +
 src/io/bin.cpp                           | 93 +++++++++++++++---------
 src/io/config_auto.cpp                   |  4 +
 src/io/dataset.cpp                       | 64 +++++++++++++++-
 src/io/dataset_loader.cpp                | 46 +++++++++---
 tests/data/forced_bins.json              | 10 +++
 tests/python_package_test/test_engine.py | 30 ++++++++
 10 files changed, 222 insertions(+), 44 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 8c16e190d223..10105bfbed5a 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -404,6 +404,14 @@ Learning Control Parameters
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
 
+-  ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+   -  path to a ``.json`` file that specifies bin upper bounds for some or all features
+
+   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
    -  decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 46baee58fc46..1c5f62cd1907 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
   * \param bin_type Type of this bin
   * \param use_missing True to enable missing value handle
   * \param zero_as_missing True to use zero as missing value
+  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
   */
   void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing);
+               bool use_missing, bool zero_as_missing, std::vector<double> forced_upper_bounds);
 
   /*!
   * \brief Use specific number of bin to calculate the size of this class
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 190e239cf5a7..d2a953ddb416 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -402,6 +402,11 @@ struct Config {
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
   std::string forcedsplits_filename = "";
 
+  // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
+  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  std::string forcedbins_filename = "";
+
   // check = >=0.0
   // check = <=1.0
   // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index e688522fbb1a..900487eafbf4 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -596,6 +596,8 @@ class Dataset {
 
   void addFeaturesFrom(Dataset* other);
 
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features);
+
  private:
   std::string data_filename_;
   /*! \brief Store used features */
@@ -630,6 +632,7 @@ class Dataset {
   bool is_finish_load_;
   int max_bin_;
   std::vector<int32_t> max_bin_by_feature_;
+  std::vector<std::vector<double>> forced_bin_bounds_;
   int bin_construct_sample_cnt_;
   int min_data_in_bin_;
   bool use_missing_;
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 9b105e282923..62713d1bddd3 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -150,8 +150,10 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
+
+    // get list of distinct values
     int left_cnt_data = 0;
     int cnt_zero = 0;
     int right_cnt_data = 0;
@@ -165,6 +167,7 @@ namespace LightGBM {
       }
     }
 
+    // get number of positive and negative distinct values
     int left_cnt = -1;
     for (int i = 0; i < num_distinct_values; ++i) {
       if (distinct_values[i] > -kZeroThreshold) {
@@ -172,17 +175,9 @@ namespace LightGBM {
         break;
       }
     }
-
     if (left_cnt < 0) {
       left_cnt = num_distinct_values;
     }
-
-    if ((left_cnt > 0) && (max_bin > 1)) {
-      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
-      left_max_bin = std::max(1, left_max_bin);
-      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
-    }
-
     int right_start = -1;
     for (int i = left_cnt; i < num_distinct_values; ++i) {
       if (distinct_values[i] > kZeroThreshold) {
@@ -191,37 +186,66 @@ namespace LightGBM {
       }
     }
 
-    if (bin_upper_bound.size() == 0) {
-      if (max_bin > 2) {
-        // create zero bin
-        bin_upper_bound.push_back(-kZeroThreshold);
-        bin_upper_bound.push_back(kZeroThreshold);
-      }
-      else if (max_bin > 1) {
+    // include zero bounds if possible
+    if (max_bin == 2) {
+      if (left_cnt == 0) {
         bin_upper_bound.push_back(kZeroThreshold);
+      } else {
+        bin_upper_bound.push_back(-kZeroThreshold);
       }
-    } else {
-      bin_upper_bound.back() = -kZeroThreshold;
-      if (max_bin > 2) {
-        // create zero bin
-        bin_upper_bound.push_back(kZeroThreshold);
+    } else if (max_bin >= 3) {
+      bin_upper_bound.push_back(-kZeroThreshold);
+      bin_upper_bound.push_back(kZeroThreshold);
+    }
+    
+    // add forced bounds, excluding zeros since we have already added zero bounds
+    int i = 0;
+    while (i < forced_upper_bounds.size()) {
+      if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
+        forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
+      } else {
+        ++i;
       }
     }
-
-    int right_max_bin = max_bin - static_cast<int>(bin_upper_bound.size());
-    if ((right_start >= 0) && (right_max_bin > 0)) {
-      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
-        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
-      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
-    } else {
-      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
+    int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
+    if (num_to_insert > 0) {
+      bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
+    }
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+
+    // find remaining bounds
+    std::vector<double> bounds_to_add;
+    int value_ind = 0;
+    for (int i = 0; i < bin_upper_bound.size(); ++i) {
+      int cnt_in_bin = 0;
+      int distinct_cnt_in_bin = 0;
+      int bin_start = value_ind;
+      while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
+        cnt_in_bin += counts[value_ind];
+        ++distinct_cnt_in_bin;
+        ++value_ind;
+      }
+      int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt)));
+      num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
+      if (i == bin_upper_bound.size() - 1) {
+        num_sub_bins = bins_remaining + 1;
+      }
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, 
+                                                            num_sub_bins, cnt_in_bin, min_data_in_bin);
+      bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
+    bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
 
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
-    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
+    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, 
+    std::vector<double> forced_upper_bounds) {
     int na_cnt = 0;
     int tmp_num_sample_values = 0;
     for (int i = 0; i < num_sample_values; ++i) {
@@ -289,14 +313,17 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                  min_data_in_bin, forced_upper_bounds);
       } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 8d75b1cde3df..ad5b43811ebe 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -211,6 +211,7 @@ std::unordered_set<std::string> Config::parameter_set({
   "monotone_constraints",
   "feature_contri",
   "forcedsplits_filename",
+  "forcedbins_filename",
   "refit_decay_rate",
   "cegb_tradeoff",
   "cegb_penalty_split",
@@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
 
   GetString(params, "forcedsplits_filename", &forcedsplits_filename);
 
+  GetString(params, "forcedbins_filename", &forcedbins_filename);
+
   GetDouble(params, "refit_decay_rate", &refit_decay_rate);
   CHECK(refit_decay_rate >=0.0);
   CHECK(refit_decay_rate <=1.0);
@@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
   str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
   str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
+  str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
   str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
   str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
   str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index f201a40a1a7a..c931e945cd24 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -8,12 +8,17 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
 #include <cstdio>
 #include <sstream>
 #include <unordered_map>
+#include <fstream>
+
+using namespace json11;
+
 
 namespace LightGBM {
 
@@ -324,6 +329,7 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
+  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) {
   if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
+  if (param.count("forcedbins_filename")) {
+    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
+    if (config_bounds != forced_bin_bounds_) {
+      Log::Warning("Cannot change forced bins after constructed Dataset handle.");
+    }
+  }
 
   if (!io_config.monotone_constraints.empty()) {
     CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
@@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
     for (int i = 0; i < num_total_features_; ++i) {
       size_of_header += feature_names_[i].size() + sizeof(int);
     }
+    // size of forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
+    }
     writer->Write(&size_of_header, sizeof(size_of_header));
     // write header
     writer->Write(&num_data_, sizeof(num_data_));
@@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
       const char* c_str = feature_names_[i].c_str();
       writer->Write(c_str, sizeof(char) * str_len);
     }
+    // write forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
+      writer->Write(&num_bounds, sizeof(int));
+      
+      for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
+      }
+    }
 
     // get size of meta data
     size_t size_of_metadata = metadata_.SizesInByte();
@@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
   for (auto n : feature_names_) {
     fprintf(file, "%s, ", n.c_str());
   }
+  fprintf(file, "\nforced_bins: ");
+  for (int i = 0; i < num_total_features_; ++i) {
+    fprintf(file, "\nfeature %d: ", i);
+    for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
+    }
+  }
   std::vector<std::unique_ptr<BinIterator>> iterators;
   iterators.reserve(num_features_);
   for (int j = 0; j < num_features_; ++j) {
@@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   PushVector(feature_names_, other->feature_names_);
   PushVector(feature2subfeature_, other->feature2subfeature_);
   PushVector(group_feature_cnt_, other->group_feature_cnt_);
+  PushVector(forced_bin_bounds_, other->forced_bin_bounds_);
   feature_groups_.reserve(other->feature_groups_.size());
   for (auto& fg : other->feature_groups_) {
     feature_groups_.emplace_back(new FeatureGroup(*fg));
@@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) {
 
   PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
   PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
-
+  PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
   num_features_ += other->num_features_;
   num_total_features_ += other->num_total_features_;
   num_groups_ += other->num_groups_;
 }
 
+
+std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    std::stringstream buffer;
+    buffer << forced_bins_stream.rdbuf();
+    std::string err;
+    Json forced_bins_json = Json::parse(buffer.str(), err);
+    CHECK(forced_bins_json.is_array());
+    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+    for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      int feature_num = forced_bins_arr[i]["feature"].int_value();
+      CHECK(feature_num < num_total_features);
+      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+      for (int j = 0; j < bounds_arr.size(); ++j) {
+        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+      }
+    }
+    // remove duplicates
+    for (int i = 0; i < num_total_features; ++i) {
+      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+      forced_bins[i].erase(new_end, forced_bins[i].end());
+    }
+  }
+  return forced_bins;
+}
+
+
 }  // namespace LightGBM
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 1130d803ea36..f36d5b1df27d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
@@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
     }
     dataset->feature_names_.emplace_back(str_buf.str());
   }
+  // get forced_bin_bounds_
+  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
+  for (int i = 0; i < dataset->num_total_features_; ++i) {
+    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
+    mem_ptr += sizeof(int);
+    dataset->forced_bin_bounds_[i] = std::vector<double>();
+    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
+    
+    for (int j = 0; j < num_bounds; ++j) {
+      double bound = tmp_ptr_forced_bounds[j];
+      dataset->forced_bin_bounds_[i].push_back(bound);
+    }
+    mem_ptr += num_bounds * sizeof(double);
+   
+  }
 
   // read size of meta data
   read_cnt = reader->Read(buffer.data(), sizeof(size_t));
@@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
   return dataset.release();
 }
 
+
 Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
                                                int** sample_indices, int num_col, const int* num_per_col,
                                                size_t total_sample_size, data_size_t num_data) {
@@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
+
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col);
+
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
   if (Network::num_machines() == 1) {
@@ -585,12 +605,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin, config_.min_data_in_bin, filter_cnt,
-                                bin_type, config_.use_missing, config_.zero_as_missing);
+                                bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin_by_feature[i], config_.min_data_in_bin,
                                 filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -630,12 +651,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin_by_feature[start[rank] + i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -872,6 +894,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
 
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_);
+
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
   CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
@@ -909,12 +935,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -955,13 +982,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type,
-                                config_.use_missing, config_.zero_as_missing);
+                                config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 2039742dc9ff..4eb1e2cb8e38 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1590,3 +1590,33 @@ def constant_metric(preds, train_data):
                                                          decreasing_metric(preds, train_data)],
                         early_stopping_rounds=5, verbose_eval=False)
         self.assertEqual(gbm.best_iteration, 1)
+
+    def test_forced_bins(self):
+        x = np.zeros((100, 2))
+        x[:, 0] = np.arange(0, 1, 0.01)
+        x[:, 1] = -np.arange(0, 1, 0.01)
+        y = np.arange(0, 1, 0.01)
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        params = {'objective': 'regression_l1',
+                  'max_bin': 6,
+                  'forcedbins_filename': forcedbins_filename,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, x.shape[1]))
+        new_x[:, 0] = [0.31, 0.37, 0.41]
+        new_x[:, 1] = [0, 0, 0]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        new_x[:, 0] = [0, 0, 0]
+        new_x[:, 1] = [-0.25, -0.5, -0.9]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 1)
+        params['forcedbins_filename'] = ''
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)

From 6c2d048c79075be9f124d46553715dbeae06471d Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 08/49] Fix style issues.

---
 docs/Parameters.rst                           |  4 +-
 .../regression}/forced_bins.json              |  2 +-
 examples/regression/train.conf                |  3 ++
 include/LightGBM/config.h                     |  4 +-
 src/io/bin.cpp                                |  2 +-
 src/io/dataset.cpp                            | 40 ++++++++++---------
 src/io/dataset_loader.cpp                     |  1 +
 tests/python_package_test/test_engine.py      |  3 +-
 8 files changed, 34 insertions(+), 25 deletions(-)
 rename {tests/data => examples/regression}/forced_bins.json (98%)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 10105bfbed5a..c4f45f0010c4 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,9 +408,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json
similarity index 98%
rename from tests/data/forced_bins.json
rename to examples/regression/forced_bins.json
index aa74c36ffb78..1ee0a49d727c 100644
--- a/tests/data/forced_bins.json
+++ b/examples/regression/forced_bins.json
@@ -7,4 +7,4 @@
         "feature": 1,
         "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
     }
-]
\ No newline at end of file
+]
diff --git a/examples/regression/train.conf b/examples/regression/train.conf
index 11396c23ecc2..4c73169dc8f9 100644
--- a/examples/regression/train.conf
+++ b/examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
 # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
 max_bin = 255
 
+# forced bin thresholds
+# forcedbins_filename = forced_bins.json
+
 # training data
 # if exsting weight file, should name to "regression.train.weight"
 # alias: train_data, train
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index d2a953ddb416..56903a9b96ae 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,8 +403,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 62713d1bddd3..2556a59b4715 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                  min_data_in_bin, forced_upper_bounds);
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index c931e945cd24..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -5,10 +5,10 @@
 #include <LightGBM/dataset.h>
 
 #include <LightGBM/feature_group.h>
+#include <LightGBM/json11.hpp>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1071,24 +1071,28 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    std::stringstream buffer;
-    buffer << forced_bins_stream.rdbuf();
-    std::string err;
-    Json forced_bins_json = Json::parse(buffer.str(), err);
-    CHECK(forced_bins_json.is_array());
-    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-    for (int i = 0; i < forced_bins_arr.size(); ++i) {
-      int feature_num = forced_bins_arr[i]["feature"].int_value();
-      CHECK(feature_num < num_total_features);
-      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-      for (int j = 0; j < bounds_arr.size(); ++j) {
-        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+        for (int j = 0; j < bounds_arr.size(); ++j) {
+          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
       }
-    }
-    // remove duplicates
-    for (int i = 0; i < num_total_features; ++i) {
-      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-      forced_bins[i].erase(new_end, forced_bins[i].end());
     }
   }
   return forced_bins;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index f36d5b1df27d..eb83d74bfe3d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4eb1e2cb8e38..2420ee9ec853 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1596,7 +1596,8 @@ def test_forced_bins(self):
         x[:, 0] = np.arange(0, 1, 0.01)
         x[:, 1] = -np.arange(0, 1, 0.01)
         y = np.arange(0, 1, 0.01)
-        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                           '../../examples/regression/forced_bins.json')
         params = {'objective': 'regression_l1',
                   'max_bin': 6,
                   'forcedbins_filename': forcedbins_filename,

From feb861f3f326787eab3e92e2945a6a8d9fdbd16b Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:19:58 +1000
Subject: [PATCH 09/49] Use stable sort.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2556a59b4715..b26a6a461e3e 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }

From 873fa64a2ba3ace8ad94271a4bbd37c0c3a6add5 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 10/49] Minor style and doc fixes.

---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 src/io/dataset_loader.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index c4f45f0010c4..28777637d100 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,7 +408,7 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 56903a9b96ae..b67ee9656468 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,7 +403,7 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index eb83d74bfe3d..c00b9b7fdae5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 4cd89e48bba33bf0ae3978bb530a214963d0d59b Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 20 Aug 2019 21:26:06 +1000
Subject: [PATCH 11/49] Change binning behavior to be same as PR #2342.

---
 src/io/bin.cpp                           | 14 +++++++----
 tests/python_package_test/test_engine.py | 31 +++++++++++++++++++++---
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index b26a6a461e3e..40da30c6ad2d 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -186,7 +186,7 @@ namespace LightGBM {
       }
     }
 
-    // include zero bounds if possible
+    // include zero bounds and infinity bound
     if (max_bin == 2) {
       if (left_cnt == 0) {
         bin_upper_bound.push_back(kZeroThreshold);
@@ -194,9 +194,14 @@ namespace LightGBM {
         bin_upper_bound.push_back(-kZeroThreshold);
       }
     } else if (max_bin >= 3) {
-      bin_upper_bound.push_back(-kZeroThreshold);
-      bin_upper_bound.push_back(kZeroThreshold);
+      if (left_cnt > 0) {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+      if (right_start >= 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
     }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     
     // add forced bounds, excluding zeros since we have already added zero bounds
     int i = 0;
@@ -207,7 +212,6 @@ namespace LightGBM {
         ++i;
       }
     }
-    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
     int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
     if (num_to_insert > 0) {
@@ -239,7 +243,7 @@ namespace LightGBM {
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
     std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
-    CHECK(bin_upper_bound.size() <= max_bin);
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
     return bin_upper_bound;
   }
 
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 2420ee9ec853..9f807d64b102 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -921,7 +921,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 99)
+        self.assertEqual(len(np.unique(est.predict(X))), 100)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
@@ -1599,7 +1599,7 @@ def test_forced_bins(self):
         forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                            '../../examples/regression/forced_bins.json')
         params = {'objective': 'regression_l1',
-                  'max_bin': 6,
+                  'max_bin': 5,
                   'forcedbins_filename': forcedbins_filename,
                   'num_leaves': 2,
                   'min_data_in_leaf': 1,
@@ -1613,7 +1613,7 @@ def test_forced_bins(self):
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 3)
         new_x[:, 0] = [0, 0, 0]
-        new_x[:, 1] = [-0.25, -0.5, -0.9]
+        new_x[:, 1] = [-0.9, -0.6, -0.3]
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 1)
         params['forcedbins_filename'] = ''
@@ -1621,3 +1621,28 @@ def test_forced_bins(self):
         est = lgb.train(params, lgb_x, num_boost_round=100)
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 3)
+
+    def test_binning_same_sign(self):
+        # test that binning works properly for features with only positive or only negative values
+        x = np.zeros((99, 2))
+        x[:, 0] = np.arange(0.01, 1, 0.01)
+        x[:, 1] = -np.arange(0.01, 1, 0.01)
+        y = np.arange(0.01, 1, 0.01)
+        params = {'objective': 'regression_l1',
+                  'max_bin': 5,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, 2))
+        new_x[:, 0] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertAlmostEqual(predicted[0], predicted[1])
+        self.assertNotAlmostEqual(predicted[1], predicted[2])
+        new_x = np.zeros((3, 2))
+        new_x[:, 1] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertNotAlmostEqual(predicted[0], predicted[1])
+        self.assertAlmostEqual(predicted[1], predicted[2])

From 9d22071dccedea825c862b02a65a1bef3fb8ce23 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 12/49] Add functionality to force bin thresholds.

---
 docs/Parameters.rst                      |  8 +++
 include/LightGBM/bin.h                   |  3 +-
 include/LightGBM/config.h                |  5 ++
 include/LightGBM/dataset.h               |  3 +
 src/io/bin.cpp                           | 88 +++++++++++++++++-------
 src/io/config_auto.cpp                   |  4 ++
 src/io/dataset.cpp                       | 64 ++++++++++++++++-
 src/io/dataset_loader.cpp                | 46 ++++++++++---
 tests/data/forced_bins.json              | 10 +++
 tests/python_package_test/test_engine.py | 32 ++++++++-
 10 files changed, 227 insertions(+), 36 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 8c16e190d223..10105bfbed5a 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -404,6 +404,14 @@ Learning Control Parameters
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
 
+-  ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+   -  path to a ``.json`` file that specifies bin upper bounds for some or all features
+
+   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
    -  decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 46baee58fc46..1c5f62cd1907 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
   * \param bin_type Type of this bin
   * \param use_missing True to enable missing value handle
   * \param zero_as_missing True to use zero as missing value
+  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
   */
   void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing);
+               bool use_missing, bool zero_as_missing, std::vector<double> forced_upper_bounds);
 
   /*!
   * \brief Use specific number of bin to calculate the size of this class
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 190e239cf5a7..d2a953ddb416 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -402,6 +402,11 @@ struct Config {
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
   std::string forcedsplits_filename = "";
 
+  // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
+  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  std::string forcedbins_filename = "";
+
   // check = >=0.0
   // check = <=1.0
   // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index e688522fbb1a..900487eafbf4 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -596,6 +596,8 @@ class Dataset {
 
   void addFeaturesFrom(Dataset* other);
 
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features);
+
  private:
   std::string data_filename_;
   /*! \brief Store used features */
@@ -630,6 +632,7 @@ class Dataset {
   bool is_finish_load_;
   int max_bin_;
   std::vector<int32_t> max_bin_by_feature_;
+  std::vector<std::vector<double>> forced_bin_bounds_;
   int bin_construct_sample_cnt_;
   int min_data_in_bin_;
   bool use_missing_;
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2e79a80266b6..62713d1bddd3 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -150,8 +150,10 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
+
+    // get list of distinct values
     int left_cnt_data = 0;
     int cnt_zero = 0;
     int right_cnt_data = 0;
@@ -165,6 +167,7 @@ namespace LightGBM {
       }
     }
 
+    // get number of positive and negative distinct values
     int left_cnt = -1;
     for (int i = 0; i < num_distinct_values; ++i) {
       if (distinct_values[i] > -kZeroThreshold) {
@@ -172,20 +175,9 @@ namespace LightGBM {
         break;
       }
     }
-
     if (left_cnt < 0) {
       left_cnt = num_distinct_values;
     }
-
-    if ((left_cnt > 0) && (max_bin > 1)) {
-      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
-      left_max_bin = std::max(1, left_max_bin);
-      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
-      if (bin_upper_bound.size() > 0) {
-        bin_upper_bound.back() = -kZeroThreshold;
-      }
-    }
-
     int right_start = -1;
     for (int i = left_cnt; i < num_distinct_values; ++i) {
       if (distinct_values[i] > kZeroThreshold) {
@@ -194,21 +186,66 @@ namespace LightGBM {
       }
     }
 
-    int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
-    if (right_start >= 0 && right_max_bin > 0) {
-      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
-        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+    // include zero bounds if possible
+    if (max_bin == 2) {
+      if (left_cnt == 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      } else {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+    } else if (max_bin >= 3) {
+      bin_upper_bound.push_back(-kZeroThreshold);
       bin_upper_bound.push_back(kZeroThreshold);
-      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
-    } else {
-      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     }
-    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
+    
+    // add forced bounds, excluding zeros since we have already added zero bounds
+    int i = 0;
+    while (i < forced_upper_bounds.size()) {
+      if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
+        forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
+      } else {
+        ++i;
+      }
+    }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
+    int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
+    if (num_to_insert > 0) {
+      bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
+    }
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+
+    // find remaining bounds
+    std::vector<double> bounds_to_add;
+    int value_ind = 0;
+    for (int i = 0; i < bin_upper_bound.size(); ++i) {
+      int cnt_in_bin = 0;
+      int distinct_cnt_in_bin = 0;
+      int bin_start = value_ind;
+      while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
+        cnt_in_bin += counts[value_ind];
+        ++distinct_cnt_in_bin;
+        ++value_ind;
+      }
+      int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt)));
+      num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
+      if (i == bin_upper_bound.size() - 1) {
+        num_sub_bins = bins_remaining + 1;
+      }
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, 
+                                                            num_sub_bins, cnt_in_bin, min_data_in_bin);
+      bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
+    }
+    bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
 
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
-    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
+    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, 
+    std::vector<double> forced_upper_bounds) {
     int na_cnt = 0;
     int tmp_num_sample_values = 0;
     for (int i = 0; i < num_sample_values; ++i) {
@@ -276,14 +313,17 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                  min_data_in_bin, forced_upper_bounds);
       } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 8d75b1cde3df..ad5b43811ebe 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -211,6 +211,7 @@ std::unordered_set<std::string> Config::parameter_set({
   "monotone_constraints",
   "feature_contri",
   "forcedsplits_filename",
+  "forcedbins_filename",
   "refit_decay_rate",
   "cegb_tradeoff",
   "cegb_penalty_split",
@@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
 
   GetString(params, "forcedsplits_filename", &forcedsplits_filename);
 
+  GetString(params, "forcedbins_filename", &forcedbins_filename);
+
   GetDouble(params, "refit_decay_rate", &refit_decay_rate);
   CHECK(refit_decay_rate >=0.0);
   CHECK(refit_decay_rate <=1.0);
@@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
   str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
   str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
+  str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
   str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
   str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
   str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index f201a40a1a7a..c931e945cd24 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -8,12 +8,17 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
 #include <cstdio>
 #include <sstream>
 #include <unordered_map>
+#include <fstream>
+
+using namespace json11;
+
 
 namespace LightGBM {
 
@@ -324,6 +329,7 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
+  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) {
   if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
+  if (param.count("forcedbins_filename")) {
+    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
+    if (config_bounds != forced_bin_bounds_) {
+      Log::Warning("Cannot change forced bins after constructed Dataset handle.");
+    }
+  }
 
   if (!io_config.monotone_constraints.empty()) {
     CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
@@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
     for (int i = 0; i < num_total_features_; ++i) {
       size_of_header += feature_names_[i].size() + sizeof(int);
     }
+    // size of forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
+    }
     writer->Write(&size_of_header, sizeof(size_of_header));
     // write header
     writer->Write(&num_data_, sizeof(num_data_));
@@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
       const char* c_str = feature_names_[i].c_str();
       writer->Write(c_str, sizeof(char) * str_len);
     }
+    // write forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
+      writer->Write(&num_bounds, sizeof(int));
+      
+      for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
+      }
+    }
 
     // get size of meta data
     size_t size_of_metadata = metadata_.SizesInByte();
@@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
   for (auto n : feature_names_) {
     fprintf(file, "%s, ", n.c_str());
   }
+  fprintf(file, "\nforced_bins: ");
+  for (int i = 0; i < num_total_features_; ++i) {
+    fprintf(file, "\nfeature %d: ", i);
+    for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
+    }
+  }
   std::vector<std::unique_ptr<BinIterator>> iterators;
   iterators.reserve(num_features_);
   for (int j = 0; j < num_features_; ++j) {
@@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   PushVector(feature_names_, other->feature_names_);
   PushVector(feature2subfeature_, other->feature2subfeature_);
   PushVector(group_feature_cnt_, other->group_feature_cnt_);
+  PushVector(forced_bin_bounds_, other->forced_bin_bounds_);
   feature_groups_.reserve(other->feature_groups_.size());
   for (auto& fg : other->feature_groups_) {
     feature_groups_.emplace_back(new FeatureGroup(*fg));
@@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) {
 
   PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
   PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
-
+  PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
   num_features_ += other->num_features_;
   num_total_features_ += other->num_total_features_;
   num_groups_ += other->num_groups_;
 }
 
+
+std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    std::stringstream buffer;
+    buffer << forced_bins_stream.rdbuf();
+    std::string err;
+    Json forced_bins_json = Json::parse(buffer.str(), err);
+    CHECK(forced_bins_json.is_array());
+    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+    for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      int feature_num = forced_bins_arr[i]["feature"].int_value();
+      CHECK(feature_num < num_total_features);
+      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+      for (int j = 0; j < bounds_arr.size(); ++j) {
+        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+      }
+    }
+    // remove duplicates
+    for (int i = 0; i < num_total_features; ++i) {
+      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+      forced_bins[i].erase(new_end, forced_bins[i].end());
+    }
+  }
+  return forced_bins;
+}
+
+
 }  // namespace LightGBM
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 1130d803ea36..f36d5b1df27d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
@@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
     }
     dataset->feature_names_.emplace_back(str_buf.str());
   }
+  // get forced_bin_bounds_
+  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
+  for (int i = 0; i < dataset->num_total_features_; ++i) {
+    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
+    mem_ptr += sizeof(int);
+    dataset->forced_bin_bounds_[i] = std::vector<double>();
+    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
+    
+    for (int j = 0; j < num_bounds; ++j) {
+      double bound = tmp_ptr_forced_bounds[j];
+      dataset->forced_bin_bounds_[i].push_back(bound);
+    }
+    mem_ptr += num_bounds * sizeof(double);
+   
+  }
 
   // read size of meta data
   read_cnt = reader->Read(buffer.data(), sizeof(size_t));
@@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
   return dataset.release();
 }
 
+
 Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
                                                int** sample_indices, int num_col, const int* num_per_col,
                                                size_t total_sample_size, data_size_t num_data) {
@@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
+
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col);
+
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
   if (Network::num_machines() == 1) {
@@ -585,12 +605,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin, config_.min_data_in_bin, filter_cnt,
-                                bin_type, config_.use_missing, config_.zero_as_missing);
+                                bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin_by_feature[i], config_.min_data_in_bin,
                                 filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -630,12 +651,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin_by_feature[start[rank] + i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -872,6 +894,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
 
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_);
+
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
   CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
@@ -909,12 +935,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -955,13 +982,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type,
-                                config_.use_missing, config_.zero_as_missing);
+                                config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 9a34de869724..4eb1e2cb8e38 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -921,7 +921,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 100)
+        self.assertEqual(len(np.unique(est.predict(X))), 99)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
@@ -1590,3 +1590,33 @@ def constant_metric(preds, train_data):
                                                          decreasing_metric(preds, train_data)],
                         early_stopping_rounds=5, verbose_eval=False)
         self.assertEqual(gbm.best_iteration, 1)
+
+    def test_forced_bins(self):
+        x = np.zeros((100, 2))
+        x[:, 0] = np.arange(0, 1, 0.01)
+        x[:, 1] = -np.arange(0, 1, 0.01)
+        y = np.arange(0, 1, 0.01)
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        params = {'objective': 'regression_l1',
+                  'max_bin': 6,
+                  'forcedbins_filename': forcedbins_filename,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, x.shape[1]))
+        new_x[:, 0] = [0.31, 0.37, 0.41]
+        new_x[:, 1] = [0, 0, 0]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        new_x[:, 0] = [0, 0, 0]
+        new_x[:, 1] = [-0.25, -0.5, -0.9]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 1)
+        params['forcedbins_filename'] = ''
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)

From 3178609187909da91dd3f5a6dd4e67fc0de30065 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 13/49] Fix style issues.

---
 docs/Parameters.rst                           |  4 +-
 .../regression}/forced_bins.json              |  2 +-
 examples/regression/train.conf                |  3 ++
 include/LightGBM/config.h                     |  4 +-
 src/io/bin.cpp                                |  2 +-
 src/io/dataset.cpp                            | 40 ++++++++++---------
 src/io/dataset_loader.cpp                     |  1 +
 tests/python_package_test/test_engine.py      |  3 +-
 8 files changed, 34 insertions(+), 25 deletions(-)
 rename {tests/data => examples/regression}/forced_bins.json (98%)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 10105bfbed5a..c4f45f0010c4 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,9 +408,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json
similarity index 98%
rename from tests/data/forced_bins.json
rename to examples/regression/forced_bins.json
index aa74c36ffb78..1ee0a49d727c 100644
--- a/tests/data/forced_bins.json
+++ b/examples/regression/forced_bins.json
@@ -7,4 +7,4 @@
         "feature": 1,
         "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
     }
-]
\ No newline at end of file
+]
diff --git a/examples/regression/train.conf b/examples/regression/train.conf
index 11396c23ecc2..4c73169dc8f9 100644
--- a/examples/regression/train.conf
+++ b/examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
 # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
 max_bin = 255
 
+# forced bin thresholds
+# forcedbins_filename = forced_bins.json
+
 # training data
 # if exsting weight file, should name to "regression.train.weight"
 # alias: train_data, train
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index d2a953ddb416..56903a9b96ae 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,8 +403,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 62713d1bddd3..2556a59b4715 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                  min_data_in_bin, forced_upper_bounds);
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index c931e945cd24..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -5,10 +5,10 @@
 #include <LightGBM/dataset.h>
 
 #include <LightGBM/feature_group.h>
+#include <LightGBM/json11.hpp>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1071,24 +1071,28 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    std::stringstream buffer;
-    buffer << forced_bins_stream.rdbuf();
-    std::string err;
-    Json forced_bins_json = Json::parse(buffer.str(), err);
-    CHECK(forced_bins_json.is_array());
-    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-    for (int i = 0; i < forced_bins_arr.size(); ++i) {
-      int feature_num = forced_bins_arr[i]["feature"].int_value();
-      CHECK(feature_num < num_total_features);
-      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-      for (int j = 0; j < bounds_arr.size(); ++j) {
-        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+        for (int j = 0; j < bounds_arr.size(); ++j) {
+          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
       }
-    }
-    // remove duplicates
-    for (int i = 0; i < num_total_features; ++i) {
-      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-      forced_bins[i].erase(new_end, forced_bins[i].end());
     }
   }
   return forced_bins;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index f36d5b1df27d..eb83d74bfe3d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4eb1e2cb8e38..2420ee9ec853 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1596,7 +1596,8 @@ def test_forced_bins(self):
         x[:, 0] = np.arange(0, 1, 0.01)
         x[:, 1] = -np.arange(0, 1, 0.01)
         y = np.arange(0, 1, 0.01)
-        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                           '../../examples/regression/forced_bins.json')
         params = {'objective': 'regression_l1',
                   'max_bin': 6,
                   'forcedbins_filename': forcedbins_filename,

From 934b305422966ed6c8348e46290a13078544d7fb Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:19:58 +1000
Subject: [PATCH 14/49] Use stable sort.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2556a59b4715..b26a6a461e3e 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }

From dc45bd1d10b508101133229696ec10be0e271ede Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 15/49] Minor style and doc fixes.

---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 src/io/dataset_loader.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index c4f45f0010c4..28777637d100 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,7 +408,7 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 56903a9b96ae..b67ee9656468 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,7 +403,7 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index eb83d74bfe3d..c00b9b7fdae5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 018182ceccdd02ef930f20e63284be2cadc1cb14 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 16/49] Add functionality to force bin thresholds.

---
 docs/Parameters.rst                      |  4 +--
 include/LightGBM/config.h                |  4 +--
 src/io/bin.cpp                           |  6 ++--
 src/io/dataset.cpp                       | 39 +++++++++++-------------
 src/io/dataset_loader.cpp                |  1 -
 tests/data/forced_bins.json              | 10 ++++++
 tests/python_package_test/test_engine.py |  3 +-
 7 files changed, 36 insertions(+), 31 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 28777637d100..10105bfbed5a 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,9 +408,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index b67ee9656468..d2a953ddb416 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,8 +403,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index b26a6a461e3e..62713d1bddd3 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                   min_data_in_bin, forced_upper_bounds);
+                                                  min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 269c06c4c37d..e948754034be 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,6 +9,7 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1071,29 +1072,25 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    if (forced_bins_stream.fail()) {
-      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
-    } else {
-      std::stringstream buffer;
-      buffer << forced_bins_stream.rdbuf();
-      std::string err;
-      Json forced_bins_json = Json::parse(buffer.str(), err);
-      CHECK(forced_bins_json.is_array());
-      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-      for (int i = 0; i < forced_bins_arr.size(); ++i) {
-        int feature_num = forced_bins_arr[i]["feature"].int_value();
-        CHECK(feature_num < num_total_features);
-        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-        for (int j = 0; j < bounds_arr.size(); ++j) {
-          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
-        }
-      }
-      // remove duplicates
-      for (int i = 0; i < num_total_features; ++i) {
-        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-        forced_bins[i].erase(new_end, forced_bins[i].end());
+    std::stringstream buffer;
+    buffer << forced_bins_stream.rdbuf();
+    std::string err;
+    Json forced_bins_json = Json::parse(buffer.str(), err);
+    CHECK(forced_bins_json.is_array());
+    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+    for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      int feature_num = forced_bins_arr[i]["feature"].int_value();
+      CHECK(feature_num < num_total_features);
+      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+      for (int j = 0; j < bounds_arr.size(); ++j) {
+        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
       }
     }
+    // remove duplicates
+    for (int i = 0; i < num_total_features; ++i) {
+      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+      forced_bins[i].erase(new_end, forced_bins[i].end());
+    }
   }
   return forced_bins;
 }
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index c00b9b7fdae5..f36d5b1df27d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 2420ee9ec853..4eb1e2cb8e38 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1596,8 +1596,7 @@ def test_forced_bins(self):
         x[:, 0] = np.arange(0, 1, 0.01)
         x[:, 1] = -np.arange(0, 1, 0.01)
         y = np.arange(0, 1, 0.01)
-        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                           '../../examples/regression/forced_bins.json')
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
         params = {'objective': 'regression_l1',
                   'max_bin': 6,
                   'forcedbins_filename': forcedbins_filename,

From 7a4df5117deca7c7ff457b07a6cbbc3807103d4c Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 17/49] Fix style issues.

---
 docs/Parameters.rst                      |  4 +--
 include/LightGBM/config.h                |  4 +--
 src/io/bin.cpp                           |  2 +-
 src/io/dataset.cpp                       | 39 +++++++++++++-----------
 src/io/dataset_loader.cpp                |  1 +
 tests/data/forced_bins.json              | 10 ------
 tests/python_package_test/test_engine.py |  3 +-
 7 files changed, 29 insertions(+), 34 deletions(-)
 delete mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 10105bfbed5a..c4f45f0010c4 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,9 +408,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index d2a953ddb416..56903a9b96ae 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,8 +403,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 62713d1bddd3..2556a59b4715 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                  min_data_in_bin, forced_upper_bounds);
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index e948754034be..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,7 +9,6 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1072,24 +1071,28 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    std::stringstream buffer;
-    buffer << forced_bins_stream.rdbuf();
-    std::string err;
-    Json forced_bins_json = Json::parse(buffer.str(), err);
-    CHECK(forced_bins_json.is_array());
-    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-    for (int i = 0; i < forced_bins_arr.size(); ++i) {
-      int feature_num = forced_bins_arr[i]["feature"].int_value();
-      CHECK(feature_num < num_total_features);
-      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-      for (int j = 0; j < bounds_arr.size(); ++j) {
-        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+        for (int j = 0; j < bounds_arr.size(); ++j) {
+          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
       }
-    }
-    // remove duplicates
-    for (int i = 0; i < num_total_features; ++i) {
-      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-      forced_bins[i].erase(new_end, forced_bins[i].end());
     }
   }
   return forced_bins;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index f36d5b1df27d..eb83d74bfe3d 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
deleted file mode 100644
index aa74c36ffb78..000000000000
--- a/tests/data/forced_bins.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "feature": 0,
-        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
-    },
-    {
-        "feature": 1,
-        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
-    }
-]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4eb1e2cb8e38..2420ee9ec853 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1596,7 +1596,8 @@ def test_forced_bins(self):
         x[:, 0] = np.arange(0, 1, 0.01)
         x[:, 1] = -np.arange(0, 1, 0.01)
         y = np.arange(0, 1, 0.01)
-        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json')
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                           '../../examples/regression/forced_bins.json')
         params = {'objective': 'regression_l1',
                   'max_bin': 6,
                   'forcedbins_filename': forcedbins_filename,

From 6095148a9a2a18690d096038250dae4c2cc5c183 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:19:58 +1000
Subject: [PATCH 18/49] Use stable sort.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2556a59b4715..b26a6a461e3e 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }

From 8b57a56b65b5b1cc8b062145bc6380db7d73c678 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 19/49] Minor style and doc fixes.

---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 src/io/dataset_loader.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index c4f45f0010c4..28777637d100 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -408,7 +408,7 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 56903a9b96ae..b67ee9656468 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -403,7 +403,7 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index eb83d74bfe3d..c00b9b7fdae5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From de83a69e65802bc64c280cdc55ec6503025b3a1f Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 20 Aug 2019 21:26:06 +1000
Subject: [PATCH 20/49] Change binning behavior to be same as PR #2342.

---
 src/io/bin.cpp                           | 14 +++++++----
 tests/python_package_test/test_engine.py | 31 +++++++++++++++++++++---
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index b26a6a461e3e..40da30c6ad2d 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -186,7 +186,7 @@ namespace LightGBM {
       }
     }
 
-    // include zero bounds if possible
+    // include zero bounds and infinity bound
     if (max_bin == 2) {
       if (left_cnt == 0) {
         bin_upper_bound.push_back(kZeroThreshold);
@@ -194,9 +194,14 @@ namespace LightGBM {
         bin_upper_bound.push_back(-kZeroThreshold);
       }
     } else if (max_bin >= 3) {
-      bin_upper_bound.push_back(-kZeroThreshold);
-      bin_upper_bound.push_back(kZeroThreshold);
+      if (left_cnt > 0) {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+      if (right_start >= 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
     }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     
     // add forced bounds, excluding zeros since we have already added zero bounds
     int i = 0;
@@ -207,7 +212,6 @@ namespace LightGBM {
         ++i;
       }
     }
-    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
     int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
     if (num_to_insert > 0) {
@@ -239,7 +243,7 @@ namespace LightGBM {
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
     std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
-    CHECK(bin_upper_bound.size() <= max_bin);
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
     return bin_upper_bound;
   }
 
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 2420ee9ec853..9f807d64b102 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -921,7 +921,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 99)
+        self.assertEqual(len(np.unique(est.predict(X))), 100)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
@@ -1599,7 +1599,7 @@ def test_forced_bins(self):
         forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                            '../../examples/regression/forced_bins.json')
         params = {'objective': 'regression_l1',
-                  'max_bin': 6,
+                  'max_bin': 5,
                   'forcedbins_filename': forcedbins_filename,
                   'num_leaves': 2,
                   'min_data_in_leaf': 1,
@@ -1613,7 +1613,7 @@ def test_forced_bins(self):
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 3)
         new_x[:, 0] = [0, 0, 0]
-        new_x[:, 1] = [-0.25, -0.5, -0.9]
+        new_x[:, 1] = [-0.9, -0.6, -0.3]
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 1)
         params['forcedbins_filename'] = ''
@@ -1621,3 +1621,28 @@ def test_forced_bins(self):
         est = lgb.train(params, lgb_x, num_boost_round=100)
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 3)
+
+    def test_binning_same_sign(self):
+        # test that binning works properly for features with only positive or only negative values
+        x = np.zeros((99, 2))
+        x[:, 0] = np.arange(0.01, 1, 0.01)
+        x[:, 1] = -np.arange(0.01, 1, 0.01)
+        y = np.arange(0.01, 1, 0.01)
+        params = {'objective': 'regression_l1',
+                  'max_bin': 5,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, 2))
+        new_x[:, 0] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertAlmostEqual(predicted[0], predicted[1])
+        self.assertNotAlmostEqual(predicted[1], predicted[2])
+        new_x = np.zeros((3, 2))
+        new_x[:, 1] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertNotAlmostEqual(predicted[0], predicted[1])
+        self.assertAlmostEqual(predicted[1], predicted[2])

From c4787757b33c5ecec6d574b1a7b2ef133c0d4f89 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 21/49] Add functionality to force bin thresholds.

---
 docs/Parameters.rst                      |  8 +++
 include/LightGBM/bin.h                   |  3 +-
 include/LightGBM/config.h                |  5 ++
 include/LightGBM/dataset.h               |  3 ++
 src/io/bin.cpp                           | 21 ++++----
 src/io/config_auto.cpp                   |  4 ++
 src/io/dataset.cpp                       | 64 +++++++++++++++++++++++-
 src/io/dataset_loader.cpp                | 46 +++++++++++++----
 tests/data/forced_bins.json              | 10 ++++
 tests/python_package_test/test_engine.py |  2 +-
 10 files changed, 145 insertions(+), 21 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index aaa10eef347b..1fd11c94bd73 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -412,6 +412,14 @@ Learning Control Parameters
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
 
+-  ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
+
+   -  path to a ``.json`` file that specifies bin upper bounds for some or all features
+
+   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
    -  decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 46baee58fc46..1c5f62cd1907 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
   * \param bin_type Type of this bin
   * \param use_missing True to enable missing value handle
   * \param zero_as_missing True to use zero as missing value
+  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
   */
   void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing);
+               bool use_missing, bool zero_as_missing, std::vector<double> forced_upper_bounds);
 
   /*!
   * \brief Use specific number of bin to calculate the size of this class
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 3e1a6c4f0bd6..049b0bf1f8df 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -408,6 +408,11 @@ struct Config {
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
   std::string forcedsplits_filename = "";
 
+  // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
+  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  std::string forcedbins_filename = "";
+
   // check = >=0.0
   // check = <=1.0
   // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index e688522fbb1a..900487eafbf4 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -596,6 +596,8 @@ class Dataset {
 
   void addFeaturesFrom(Dataset* other);
 
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features);
+
  private:
   std::string data_filename_;
   /*! \brief Store used features */
@@ -630,6 +632,7 @@ class Dataset {
   bool is_finish_load_;
   int max_bin_;
   std::vector<int32_t> max_bin_by_feature_;
+  std::vector<std::vector<double>> forced_bin_bounds_;
   int bin_construct_sample_cnt_;
   int min_data_in_bin_;
   bool use_missing_;
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2e79a80266b6..5c41edaad9b2 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -150,8 +150,10 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
+
+    // get list of distinct values
     int left_cnt_data = 0;
     int cnt_zero = 0;
     int right_cnt_data = 0;
@@ -165,6 +167,7 @@ namespace LightGBM {
       }
     }
 
+    // get number of positive and negative distinct values
     int left_cnt = -1;
     for (int i = 0; i < num_distinct_values; ++i) {
       if (distinct_values[i] > -kZeroThreshold) {
@@ -172,7 +175,6 @@ namespace LightGBM {
         break;
       }
     }
-
     if (left_cnt < 0) {
       left_cnt = num_distinct_values;
     }
@@ -199,16 +201,14 @@ namespace LightGBM {
       auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
         num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
       bin_upper_bound.push_back(kZeroThreshold);
-      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
-    } else {
-      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     }
     CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
     return bin_upper_bound;
   }
 
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
-    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
+    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, 
+    std::vector<double> forced_upper_bounds) {
     int na_cnt = 0;
     int tmp_num_sample_values = 0;
     for (int i = 0; i < num_sample_values; ++i) {
@@ -276,14 +276,17 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
+                                                  min_data_in_bin, forced_upper_bounds);
       } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
+                                                   min_data_in_bin, forced_upper_bounds);
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index b2957cb6335b..aaafe6d4507c 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -214,6 +214,7 @@ std::unordered_set<std::string> Config::parameter_set({
   "monotone_constraints",
   "feature_contri",
   "forcedsplits_filename",
+  "forcedbins_filename",
   "refit_decay_rate",
   "cegb_tradeoff",
   "cegb_penalty_split",
@@ -402,6 +403,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
 
   GetString(params, "forcedsplits_filename", &forcedsplits_filename);
 
+  GetString(params, "forcedbins_filename", &forcedbins_filename);
+
   GetDouble(params, "refit_decay_rate", &refit_decay_rate);
   CHECK(refit_decay_rate >=0.0);
   CHECK(refit_decay_rate <=1.0);
@@ -617,6 +620,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
   str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
   str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
+  str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
   str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
   str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
   str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index f201a40a1a7a..c931e945cd24 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -8,12 +8,17 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
 #include <cstdio>
 #include <sstream>
 #include <unordered_map>
+#include <fstream>
+
+using namespace json11;
+
 
 namespace LightGBM {
 
@@ -324,6 +329,7 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
+  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) {
   if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
+  if (param.count("forcedbins_filename")) {
+    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
+    if (config_bounds != forced_bin_bounds_) {
+      Log::Warning("Cannot change forced bins after constructed Dataset handle.");
+    }
+  }
 
   if (!io_config.monotone_constraints.empty()) {
     CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
@@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
     for (int i = 0; i < num_total_features_; ++i) {
       size_of_header += feature_names_[i].size() + sizeof(int);
     }
+    // size of forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
+    }
     writer->Write(&size_of_header, sizeof(size_of_header));
     // write header
     writer->Write(&num_data_, sizeof(num_data_));
@@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
       const char* c_str = feature_names_[i].c_str();
       writer->Write(c_str, sizeof(char) * str_len);
     }
+    // write forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
+      writer->Write(&num_bounds, sizeof(int));
+      
+      for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
+      }
+    }
 
     // get size of meta data
     size_t size_of_metadata = metadata_.SizesInByte();
@@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
   for (auto n : feature_names_) {
     fprintf(file, "%s, ", n.c_str());
   }
+  fprintf(file, "\nforced_bins: ");
+  for (int i = 0; i < num_total_features_; ++i) {
+    fprintf(file, "\nfeature %d: ", i);
+    for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
+    }
+  }
   std::vector<std::unique_ptr<BinIterator>> iterators;
   iterators.reserve(num_features_);
   for (int j = 0; j < num_features_; ++j) {
@@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   PushVector(feature_names_, other->feature_names_);
   PushVector(feature2subfeature_, other->feature2subfeature_);
   PushVector(group_feature_cnt_, other->group_feature_cnt_);
+  PushVector(forced_bin_bounds_, other->forced_bin_bounds_);
   feature_groups_.reserve(other->feature_groups_.size());
   for (auto& fg : other->feature_groups_) {
     feature_groups_.emplace_back(new FeatureGroup(*fg));
@@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) {
 
   PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
   PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
-
+  PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
   num_features_ += other->num_features_;
   num_total_features_ += other->num_total_features_;
   num_groups_ += other->num_groups_;
 }
 
+
+std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    std::stringstream buffer;
+    buffer << forced_bins_stream.rdbuf();
+    std::string err;
+    Json forced_bins_json = Json::parse(buffer.str(), err);
+    CHECK(forced_bins_json.is_array());
+    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+    for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      int feature_num = forced_bins_arr[i]["feature"].int_value();
+      CHECK(feature_num < num_total_features);
+      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+      for (int j = 0; j < bounds_arr.size(); ++j) {
+        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+      }
+    }
+    // remove duplicates
+    for (int i = 0; i < num_total_features; ++i) {
+      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+      forced_bins[i].erase(new_end, forced_bins[i].end());
+    }
+  }
+  return forced_bins;
+}
+
+
 }  // namespace LightGBM
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index ee47bece8fa5..bdfe3b0b4dfc 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
@@ -458,6 +457,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
     }
     dataset->feature_names_.emplace_back(str_buf.str());
   }
+  // get forced_bin_bounds_
+  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
+  for (int i = 0; i < dataset->num_total_features_; ++i) {
+    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
+    mem_ptr += sizeof(int);
+    dataset->forced_bin_bounds_[i] = std::vector<double>();
+    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
+    
+    for (int j = 0; j < num_bounds; ++j) {
+      double bound = tmp_ptr_forced_bounds[j];
+      dataset->forced_bin_bounds_[i].push_back(bound);
+    }
+    mem_ptr += num_bounds * sizeof(double);
+   
+  }
 
   // read size of meta data
   read_cnt = reader->Read(buffer.data(), sizeof(size_t));
@@ -549,6 +563,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
   return dataset.release();
 }
 
+
 Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
                                                int** sample_indices, int num_col, const int* num_per_col,
                                                size_t total_sample_size, data_size_t num_data) {
@@ -565,6 +580,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
+
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col);
+
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
   if (Network::num_machines() == 1) {
@@ -589,12 +609,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin, config_.min_data_in_bin, filter_cnt,
-                                bin_type, config_.use_missing, config_.zero_as_missing);
+                                bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin_by_feature[i], config_.min_data_in_bin,
                                 filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -634,12 +655,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin_by_feature[start[rank] + i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -876,6 +898,10 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
 
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_);
+
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
   CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
@@ -913,12 +939,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -959,13 +986,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type,
-                                config_.use_missing, config_.zero_as_missing);
+                                config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 63f1468132a5..4c60a23ba4ea 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -915,7 +915,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 100)
+        self.assertEqual(len(np.unique(est.predict(X))), 99)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)

From e3f183572fb35f2fcc73144eeb67a8106ea72eca Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 22/49] Fix style issues.

---
 docs/Parameters.rst                           |  4 +-
 .../regression}/forced_bins.json              |  2 +-
 examples/regression/train.conf                |  3 ++
 include/LightGBM/config.h                     |  4 +-
 src/io/bin.cpp                                |  2 +-
 src/io/dataset.cpp                            | 40 ++++++++++---------
 src/io/dataset_loader.cpp                     |  1 +
 7 files changed, 32 insertions(+), 24 deletions(-)
 rename {tests/data => examples/regression}/forced_bins.json (98%)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 1fd11c94bd73..e33b36eb944e 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -416,9 +416,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json
similarity index 98%
rename from tests/data/forced_bins.json
rename to examples/regression/forced_bins.json
index aa74c36ffb78..1ee0a49d727c 100644
--- a/tests/data/forced_bins.json
+++ b/examples/regression/forced_bins.json
@@ -7,4 +7,4 @@
         "feature": 1,
         "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
     }
-]
\ No newline at end of file
+]
diff --git a/examples/regression/train.conf b/examples/regression/train.conf
index 11396c23ecc2..4c73169dc8f9 100644
--- a/examples/regression/train.conf
+++ b/examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
 # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
 max_bin = 255
 
+# forced bin thresholds
+# forcedbins_filename = forced_bins.json
+
 # training data
 # if exsting weight file, should name to "regression.train.weight"
 # alias: train_data, train
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 049b0bf1f8df..8e0f0608a282 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -409,8 +409,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 5c41edaad9b2..43ab1c8eacdb 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -283,7 +283,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                  min_data_in_bin, forced_upper_bounds);
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index c931e945cd24..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -5,10 +5,10 @@
 #include <LightGBM/dataset.h>
 
 #include <LightGBM/feature_group.h>
+#include <LightGBM/json11.hpp>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1071,24 +1071,28 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    std::stringstream buffer;
-    buffer << forced_bins_stream.rdbuf();
-    std::string err;
-    Json forced_bins_json = Json::parse(buffer.str(), err);
-    CHECK(forced_bins_json.is_array());
-    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-    for (int i = 0; i < forced_bins_arr.size(); ++i) {
-      int feature_num = forced_bins_arr[i]["feature"].int_value();
-      CHECK(feature_num < num_total_features);
-      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-      for (int j = 0; j < bounds_arr.size(); ++j) {
-        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+        for (int j = 0; j < bounds_arr.size(); ++j) {
+          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
       }
-    }
-    // remove duplicates
-    for (int i = 0; i < num_total_features; ++i) {
-      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-      forced_bins[i].erase(new_end, forced_bins[i].end());
     }
   }
   return forced_bins;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index bdfe3b0b4dfc..7a11957558c5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>

From 2280c568715f241fb5adbc533895483803878695 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 23/49] Minor style and doc fixes.

---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 src/io/dataset_loader.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index e33b36eb944e..b971215dcde9 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -416,7 +416,7 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 8e0f0608a282..baba482c5e52 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -409,7 +409,7 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 7a11957558c5..6e60560a9be1 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 76fa4ccf2167d9337adfe9803b01025ae7a37b1f Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 24/49] Add functionality to force bin thresholds.

---
 src/io/dataset.cpp          |  1 +
 src/io/dataset_loader.cpp   |  1 -
 tests/data/forced_bins.json | 10 ++++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 269c06c4c37d..2e400387663e 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,6 +9,7 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 6e60560a9be1..bdfe3b0b4dfc 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file

From 93d92ebc94a2cb1a9ced2a175945f01eecb9f8ae Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 25/49] Fix style issues.

---
 src/io/dataset.cpp          |  1 -
 src/io/dataset_loader.cpp   |  1 +
 tests/data/forced_bins.json | 10 ----------
 3 files changed, 1 insertion(+), 11 deletions(-)
 delete mode 100644 tests/data/forced_bins.json

diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 2e400387663e..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,7 +9,6 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index bdfe3b0b4dfc..7a11957558c5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
deleted file mode 100644
index aa74c36ffb78..000000000000
--- a/tests/data/forced_bins.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "feature": 0,
-        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
-    },
-    {
-        "feature": 1,
-        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
-    }
-]
\ No newline at end of file

From fec30a581974f858a7ed3900f062187c46576f7e Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 26/49] Minor style and doc fixes.

---
 src/io/dataset_loader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 7a11957558c5..6e60560a9be1 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 503e7b49e2ee2af65fb955bae2afb8b31a8cfd0d Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 20 Aug 2019 21:26:06 +1000
Subject: [PATCH 27/49] Change binning behavior to be same as PR #2342.

---
 tests/python_package_test/test_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4c60a23ba4ea..63f1468132a5 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -915,7 +915,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 99)
+        self.assertEqual(len(np.unique(est.predict(X))), 100)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)

From eecb80c7ce475f671db4f5d64bd5427abe3d89a5 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 28/49] Add functionality to force bin thresholds.

---
 src/io/bin.cpp                           | 67 ++++++++++++++++++------
 src/io/dataset.cpp                       |  1 +
 src/io/dataset_loader.cpp                |  1 -
 tests/data/forced_bins.json              | 10 ++++
 tests/python_package_test/test_engine.py |  2 +-
 5 files changed, 64 insertions(+), 17 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 43ab1c8eacdb..2556a59b4715 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -178,16 +178,6 @@ namespace LightGBM {
     if (left_cnt < 0) {
       left_cnt = num_distinct_values;
     }
-
-    if ((left_cnt > 0) && (max_bin > 1)) {
-      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
-      left_max_bin = std::max(1, left_max_bin);
-      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
-      if (bin_upper_bound.size() > 0) {
-        bin_upper_bound.back() = -kZeroThreshold;
-      }
-    }
-
     int right_start = -1;
     for (int i = left_cnt; i < num_distinct_values; ++i) {
       if (distinct_values[i] > kZeroThreshold) {
@@ -196,13 +186,60 @@ namespace LightGBM {
       }
     }
 
-    int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
-    if (right_start >= 0 && right_max_bin > 0) {
-      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
-        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+    // include zero bounds if possible
+    if (max_bin == 2) {
+      if (left_cnt == 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      } else {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+    } else if (max_bin >= 3) {
+      bin_upper_bound.push_back(-kZeroThreshold);
       bin_upper_bound.push_back(kZeroThreshold);
     }
-    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
+    
+    // add forced bounds, excluding zeros since we have already added zero bounds
+    int i = 0;
+    while (i < forced_upper_bounds.size()) {
+      if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
+        forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
+      } else {
+        ++i;
+      }
+    }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
+    int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
+    if (num_to_insert > 0) {
+      bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
+    }
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+
+    // find remaining bounds
+    std::vector<double> bounds_to_add;
+    int value_ind = 0;
+    for (int i = 0; i < bin_upper_bound.size(); ++i) {
+      int cnt_in_bin = 0;
+      int distinct_cnt_in_bin = 0;
+      int bin_start = value_ind;
+      while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
+        cnt_in_bin += counts[value_ind];
+        ++distinct_cnt_in_bin;
+        ++value_ind;
+      }
+      int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt)));
+      num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
+      if (i == bin_upper_bound.size() - 1) {
+        num_sub_bins = bins_remaining + 1;
+      }
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, 
+                                                            num_sub_bins, cnt_in_bin, min_data_in_bin);
+      bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
+    }
+    bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
 
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 269c06c4c37d..2e400387663e 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,6 +9,7 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 6e60560a9be1..bdfe3b0b4dfc 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 63f1468132a5..4c60a23ba4ea 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -915,7 +915,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 100)
+        self.assertEqual(len(np.unique(est.predict(X))), 99)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)

From a02b3a3eaf91bdfcf163d7fd888e0184ecdadb45 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 29/49] Fix style issues.

---
 src/io/dataset.cpp          |  1 -
 src/io/dataset_loader.cpp   |  1 +
 tests/data/forced_bins.json | 10 ----------
 3 files changed, 1 insertion(+), 11 deletions(-)
 delete mode 100644 tests/data/forced_bins.json

diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 2e400387663e..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,7 +9,6 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index bdfe3b0b4dfc..7a11957558c5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
deleted file mode 100644
index aa74c36ffb78..000000000000
--- a/tests/data/forced_bins.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "feature": 0,
-        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
-    },
-    {
-        "feature": 1,
-        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
-    }
-]
\ No newline at end of file

From cb12379795b6307820620e0a98ad01c4cbf0ff5e Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:19:58 +1000
Subject: [PATCH 30/49] Use stable sort.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2556a59b4715..b26a6a461e3e 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }

From abe95d787c34084ed431c84ebff2bde9797b0d2a Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 31/49] Minor style and doc fixes.

---
 src/io/dataset_loader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 7a11957558c5..6e60560a9be1 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 7aed6892e6d9f17a587e86d224e22248e8bffae6 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 13 Aug 2019 18:14:54 +1000
Subject: [PATCH 32/49] Add functionality to force bin thresholds.

---
 docs/Parameters.rst         |  4 ++--
 include/LightGBM/config.h   |  4 ++--
 src/io/bin.cpp              |  6 +++---
 src/io/dataset.cpp          | 39 +++++++++++++++++--------------------
 src/io/dataset_loader.cpp   |  1 -
 tests/data/forced_bins.json | 10 ++++++++++
 6 files changed, 35 insertions(+), 29 deletions(-)
 create mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index b971215dcde9..1fd11c94bd73 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -416,9 +416,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index baba482c5e52..049b0bf1f8df 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -409,8 +409,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index b26a6a461e3e..62713d1bddd3 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                   min_data_in_bin, forced_upper_bounds);
+                                                  min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 269c06c4c37d..e948754034be 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,6 +9,7 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
+#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1071,29 +1072,25 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    if (forced_bins_stream.fail()) {
-      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
-    } else {
-      std::stringstream buffer;
-      buffer << forced_bins_stream.rdbuf();
-      std::string err;
-      Json forced_bins_json = Json::parse(buffer.str(), err);
-      CHECK(forced_bins_json.is_array());
-      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-      for (int i = 0; i < forced_bins_arr.size(); ++i) {
-        int feature_num = forced_bins_arr[i]["feature"].int_value();
-        CHECK(feature_num < num_total_features);
-        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-        for (int j = 0; j < bounds_arr.size(); ++j) {
-          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
-        }
-      }
-      // remove duplicates
-      for (int i = 0; i < num_total_features; ++i) {
-        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-        forced_bins[i].erase(new_end, forced_bins[i].end());
+    std::stringstream buffer;
+    buffer << forced_bins_stream.rdbuf();
+    std::string err;
+    Json forced_bins_json = Json::parse(buffer.str(), err);
+    CHECK(forced_bins_json.is_array());
+    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+    for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      int feature_num = forced_bins_arr[i]["feature"].int_value();
+      CHECK(feature_num < num_total_features);
+      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+      for (int j = 0; j < bounds_arr.size(); ++j) {
+        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
       }
     }
+    // remove duplicates
+    for (int i = 0; i < num_total_features; ++i) {
+      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+      forced_bins[i].erase(new_end, forced_bins[i].end());
+    }
   }
   return forced_bins;
 }
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 6e60560a9be1..bdfe3b0b4dfc 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset_loader.h>
-
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
new file mode 100644
index 000000000000..aa74c36ffb78
--- /dev/null
+++ b/tests/data/forced_bins.json
@@ -0,0 +1,10 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
+    },
+    {
+        "feature": 1,
+        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
+    }
+]
\ No newline at end of file

From 35ce38bd7eb48a7a7110e8f4206abd07622a26ee Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:10:21 +1000
Subject: [PATCH 33/49] Fix style issues.

---
 docs/Parameters.rst         |  4 ++--
 include/LightGBM/config.h   |  4 ++--
 src/io/bin.cpp              |  2 +-
 src/io/dataset.cpp          | 39 ++++++++++++++++++++-----------------
 src/io/dataset_loader.cpp   |  1 +
 tests/data/forced_bins.json | 10 ----------
 6 files changed, 27 insertions(+), 33 deletions(-)
 delete mode 100644 tests/data/forced_bins.json

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 1fd11c94bd73..e33b36eb944e 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -416,9 +416,9 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
 
-   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+   -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
 -  ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 049b0bf1f8df..8e0f0608a282 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -409,8 +409,8 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning)
-  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
   // check = >=0.0
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 62713d1bddd3..2556a59b4715 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -320,7 +320,7 @@ namespace LightGBM {
         }
       } else if (missing_type_ == MissingType::None) {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                  min_data_in_bin, forced_upper_bounds);
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
         bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
                                                    min_data_in_bin, forced_upper_bounds);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index e948754034be..269c06c4c37d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -9,7 +9,6 @@
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
-#include <LightGBM/json11.hpp>
 
 #include <limits>
 #include <chrono>
@@ -1072,24 +1071,28 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    std::stringstream buffer;
-    buffer << forced_bins_stream.rdbuf();
-    std::string err;
-    Json forced_bins_json = Json::parse(buffer.str(), err);
-    CHECK(forced_bins_json.is_array());
-    std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-    for (int i = 0; i < forced_bins_arr.size(); ++i) {
-      int feature_num = forced_bins_arr[i]["feature"].int_value();
-      CHECK(feature_num < num_total_features);
-      std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-      for (int j = 0; j < bounds_arr.size(); ++j) {
-        forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+        for (int j = 0; j < bounds_arr.size(); ++j) {
+          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
       }
-    }
-    // remove duplicates
-    for (int i = 0; i < num_total_features; ++i) {
-      auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-      forced_bins[i].erase(new_end, forced_bins[i].end());
     }
   }
   return forced_bins;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index bdfe3b0b4dfc..7a11957558c5 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,6 +2,7 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
+
 #include <LightGBM/dataset_loader.h>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
diff --git a/tests/data/forced_bins.json b/tests/data/forced_bins.json
deleted file mode 100644
index aa74c36ffb78..000000000000
--- a/tests/data/forced_bins.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "feature": 0,
-        "bin_upper_bound": [ 0.3, 0.35, 0.4 ]
-    },
-    {
-        "feature": 1,
-        "bin_upper_bound": [ -0.1, -0.15, -0.2 ]
-    }
-]
\ No newline at end of file

From 28c046205332312519d049ff536ed26c34f8dd43 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 14 Aug 2019 20:19:58 +1000
Subject: [PATCH 34/49] Use stable sort.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2556a59b4715..b26a6a461e3e 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -213,7 +213,7 @@ namespace LightGBM {
     if (num_to_insert > 0) {
       bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
     }
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
     std::vector<double> bounds_to_add;
@@ -238,7 +238,7 @@ namespace LightGBM {
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
-    std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
     CHECK(bin_upper_bound.size() <= max_bin);
     return bin_upper_bound;
   }

From 23dbb29f4e9631b9430592bea53cafa3c3372e60 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:17:19 +1000
Subject: [PATCH 35/49] Minor style and doc fixes.

---
 docs/Parameters.rst       | 2 +-
 include/LightGBM/config.h | 2 +-
 src/io/dataset_loader.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index e33b36eb944e..b971215dcde9 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -416,7 +416,7 @@ Learning Control Parameters
 
    -  path to a ``.json`` file that specifies bin upper bounds for some or all features
 
-   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+   -  ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
 
    -  see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 8e0f0608a282..baba482c5e52 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -409,7 +409,7 @@ struct Config {
   std::string forcedsplits_filename = "";
 
   // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
-  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning)
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
   // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
   std::string forcedbins_filename = "";
 
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 7a11957558c5..6e60560a9be1 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -2,8 +2,8 @@
  * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
-
 #include <LightGBM/dataset_loader.h>
+
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>

From 9ed04a336b9839366c93b66f050ad52917ed0b68 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 20 Aug 2019 21:26:06 +1000
Subject: [PATCH 36/49] Change binning behavior to be same as PR #2342.

---
 src/io/bin.cpp                           | 14 +++++++++-----
 tests/python_package_test/test_engine.py |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index b26a6a461e3e..40da30c6ad2d 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -186,7 +186,7 @@ namespace LightGBM {
       }
     }
 
-    // include zero bounds if possible
+    // include zero bounds and infinity bound
     if (max_bin == 2) {
       if (left_cnt == 0) {
         bin_upper_bound.push_back(kZeroThreshold);
@@ -194,9 +194,14 @@ namespace LightGBM {
         bin_upper_bound.push_back(-kZeroThreshold);
       }
     } else if (max_bin >= 3) {
-      bin_upper_bound.push_back(-kZeroThreshold);
-      bin_upper_bound.push_back(kZeroThreshold);
+      if (left_cnt > 0) {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+      if (right_start >= 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
     }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     
     // add forced bounds, excluding zeros since we have already added zero bounds
     int i = 0;
@@ -207,7 +212,6 @@ namespace LightGBM {
         ++i;
       }
     }
-    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
     int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
     if (num_to_insert > 0) {
@@ -239,7 +243,7 @@ namespace LightGBM {
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
     std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
-    CHECK(bin_upper_bound.size() <= max_bin);
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
     return bin_upper_bound;
   }
 
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 4c60a23ba4ea..63f1468132a5 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -915,7 +915,7 @@ def test_max_bin_by_feature(self):
         }
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)
-        self.assertEqual(len(np.unique(est.predict(X))), 99)
+        self.assertEqual(len(np.unique(est.predict(X))), 100)
         params['max_bin_by_feature'] = [2, 100]
         lgb_data = lgb.Dataset(X, label=y)
         est = lgb.train(params, lgb_data, num_boost_round=1)

From 51e93a9d4d3b5f8cdd18dd0f4111b69d90b6fa5d Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Wed, 11 Sep 2019 22:05:00 +1000
Subject: [PATCH 37/49] Use different bin finding function for predefined
 bounds.

---
 src/io/bin.cpp | 90 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 84 insertions(+), 6 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 40da30c6ad2d..6f9b7a471177 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -149,7 +149,69 @@ namespace LightGBM {
     return bin_upper_bound;
   }
 
+
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+    std::vector<double> bin_upper_bound;
+    int left_cnt_data = 0;
+    int cnt_zero = 0;
+    int right_cnt_data = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] <= -kZeroThreshold) {
+        left_cnt_data += counts[i];
+      }
+      else if (distinct_values[i] > kZeroThreshold) {
+        right_cnt_data += counts[i];
+      }
+      else {
+        cnt_zero += counts[i];
+      }
+    }
+
+    int left_cnt = -1;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > -kZeroThreshold) {
+        left_cnt = i;
+        break;
+      }
+    }
+
+    if (left_cnt < 0) {
+      left_cnt = num_distinct_values;
+    }
+
+    if ((left_cnt > 0) && (max_bin > 1)) {
+      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
+      left_max_bin = std::max(1, left_max_bin);
+      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
+      if (bin_upper_bound.size() > 0) {
+        bin_upper_bound.back() = -kZeroThreshold;
+      }
+    }
+
+    int right_start = -1;
+    for (int i = left_cnt; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > kZeroThreshold) {
+        right_start = i;
+        break;
+      }
+    }
+
+    int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
+    if (right_start >= 0 && right_max_bin > 0) {
+      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
+        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+      bin_upper_bound.push_back(kZeroThreshold);
+      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
+    }
+    else {
+      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    }
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
+    return bin_upper_bound;
+  }
+
+  std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
     int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
 
@@ -317,17 +379,33 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                   min_data_in_bin, forced_upper_bounds);
+        auto empty_vec = std::vector<double>();
+        if (forced_upper_bounds.size() == 0) {
+          bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                     min_data_in_bin);
+        } else {
+          bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                      min_data_in_bin, forced_upper_bounds);
+        }
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, 
-                                                   min_data_in_bin, forced_upper_bounds);
+        if (forced_upper_bounds.size() == 0) {
+          bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                     min_data_in_bin);
+        } else {
+          bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                      min_data_in_bin, forced_upper_bounds);
+        }
       } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, 
-                                                   min_data_in_bin, forced_upper_bounds);
+        if (forced_upper_bounds.size() == 0) {
+          bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
+                                                     min_data_in_bin);
+        } else {
+          bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
+                                                      min_data_in_bin, forced_upper_bounds);
+        }
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());

From 4e3355a4699aa1d5c49866c51e9c7aa27b203366 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 12 Sep 2019 19:21:00 +1000
Subject: [PATCH 38/49] Fix style issues.

---
 src/io/bin.cpp     | 13 ++++---------
 src/io/dataset.cpp |  4 ++--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 6f9b7a471177..88da7991e9b0 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -149,7 +149,6 @@ namespace LightGBM {
     return bin_upper_bound;
   }
 
-
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
     int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
     std::vector<double> bin_upper_bound;
@@ -159,11 +158,9 @@ namespace LightGBM {
     for (int i = 0; i < num_distinct_values; ++i) {
       if (distinct_values[i] <= -kZeroThreshold) {
         left_cnt_data += counts[i];
-      }
-      else if (distinct_values[i] > kZeroThreshold) {
+      } else if (distinct_values[i] > kZeroThreshold) {
         right_cnt_data += counts[i];
-      }
-      else {
+      } else {
         cnt_zero += counts[i];
       }
     }
@@ -203,8 +200,7 @@ namespace LightGBM {
         num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
       bin_upper_bound.push_back(kZeroThreshold);
       bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
-    }
-    else {
+    } else {
       bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
     }
     CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
@@ -300,7 +296,7 @@ namespace LightGBM {
         num_sub_bins = bins_remaining + 1;
       }
       std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, 
-                                                            num_sub_bins, cnt_in_bin, min_data_in_bin);
+                                                           num_sub_bins, cnt_in_bin, min_data_in_bin);
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
@@ -379,7 +375,6 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        auto empty_vec = std::vector<double>();
         if (forced_upper_bounds.size() == 0) {
           bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
                                                      min_data_in_bin);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 269c06c4c37d..21977a660de0 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -10,12 +10,12 @@
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
 
-#include <limits>
 #include <chrono>
 #include <cstdio>
+#include <fstream>
+#include <limits>
 #include <sstream>
 #include <unordered_map>
-#include <fstream>
 
 using namespace json11;
 

From 821b2ab440541136f045a6261f9bfd48cf37fda3 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Thu, 12 Sep 2019 21:10:42 +1000
Subject: [PATCH 39/49] Minor refactoring, overload FindBinWithZeroAsOneBin.

---
 src/io/bin.cpp | 161 ++++++++++++++++++++++++-------------------------
 1 file changed, 78 insertions(+), 83 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 88da7991e9b0..8ca57b936b08 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -71,7 +71,7 @@ namespace LightGBM {
     return true;
   }
 
-  std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
+  std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts, 
     int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
     std::vector<double> bin_upper_bound;
     CHECK(max_bin > 0);
@@ -149,64 +149,6 @@ namespace LightGBM {
     return bin_upper_bound;
   }
 
-  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
-    std::vector<double> bin_upper_bound;
-    int left_cnt_data = 0;
-    int cnt_zero = 0;
-    int right_cnt_data = 0;
-    for (int i = 0; i < num_distinct_values; ++i) {
-      if (distinct_values[i] <= -kZeroThreshold) {
-        left_cnt_data += counts[i];
-      } else if (distinct_values[i] > kZeroThreshold) {
-        right_cnt_data += counts[i];
-      } else {
-        cnt_zero += counts[i];
-      }
-    }
-
-    int left_cnt = -1;
-    for (int i = 0; i < num_distinct_values; ++i) {
-      if (distinct_values[i] > -kZeroThreshold) {
-        left_cnt = i;
-        break;
-      }
-    }
-
-    if (left_cnt < 0) {
-      left_cnt = num_distinct_values;
-    }
-
-    if ((left_cnt > 0) && (max_bin > 1)) {
-      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
-      left_max_bin = std::max(1, left_max_bin);
-      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
-      if (bin_upper_bound.size() > 0) {
-        bin_upper_bound.back() = -kZeroThreshold;
-      }
-    }
-
-    int right_start = -1;
-    for (int i = left_cnt; i < num_distinct_values; ++i) {
-      if (distinct_values[i] > kZeroThreshold) {
-        right_start = i;
-        break;
-      }
-    }
-
-    int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
-    if (right_start >= 0 && right_max_bin > 0) {
-      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
-        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
-      bin_upper_bound.push_back(kZeroThreshold);
-      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
-    } else {
-      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
-    }
-    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
-    return bin_upper_bound;
-  }
-
   std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
     int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
@@ -260,7 +202,7 @@ namespace LightGBM {
       }
     }
     bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
-    
+
     // add forced bounds, excluding zeros since we have already added zero bounds
     int i = 0;
     while (i < forced_upper_bounds.size()) {
@@ -295,8 +237,8 @@ namespace LightGBM {
       if (i == bin_upper_bound.size() - 1) {
         num_sub_bins = bins_remaining + 1;
       }
-      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, 
-                                                           num_sub_bins, cnt_in_bin, min_data_in_bin);
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
+        num_sub_bins, cnt_in_bin, min_data_in_bin);
       bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
     }
     bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
@@ -305,6 +247,74 @@ namespace LightGBM {
     return bin_upper_bound;
   }
 
+  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, 
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+    std::vector<double> bin_upper_bound;
+    int left_cnt_data = 0;
+    int cnt_zero = 0;
+    int right_cnt_data = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] <= -kZeroThreshold) {
+        left_cnt_data += counts[i];
+      } else if (distinct_values[i] > kZeroThreshold) {
+        right_cnt_data += counts[i];
+      } else {
+        cnt_zero += counts[i];
+      }
+    }
+
+    int left_cnt = -1;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > -kZeroThreshold) {
+        left_cnt = i;
+        break;
+      }
+    }
+
+    if (left_cnt < 0) {
+      left_cnt = num_distinct_values;
+    }
+
+    if ((left_cnt > 0) && (max_bin > 1)) {
+      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
+      left_max_bin = std::max(1, left_max_bin);
+      bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
+      if (bin_upper_bound.size() > 0) {
+        bin_upper_bound.back() = -kZeroThreshold;
+      }
+    }
+
+    int right_start = -1;
+    for (int i = left_cnt; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > kZeroThreshold) {
+        right_start = i;
+        break;
+      }
+    }
+
+    int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
+    if (right_start >= 0 && right_max_bin > 0) {
+      auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
+        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+      bin_upper_bound.push_back(kZeroThreshold);
+      bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
+    } else {
+      bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+    }
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
+    return bin_upper_bound;
+  }
+
+  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
+    if (forced_upper_bounds.empty()) {
+      return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+    } else {
+      return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin,
+                                      forced_upper_bounds);
+    }
+  }
+
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
     int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, 
     std::vector<double> forced_upper_bounds) {
@@ -375,32 +385,17 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        if (forced_upper_bounds.size() == 0) {
-          bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
-                                                     min_data_in_bin);
-        } else {
-          bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
-                                                      min_data_in_bin, forced_upper_bounds);
-        }
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                   min_data_in_bin, forced_upper_bounds);
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        if (forced_upper_bounds.size() == 0) {
-          bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
-                                                     min_data_in_bin);
-        } else {
-          bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
-                                                      min_data_in_bin, forced_upper_bounds);
-        }
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
-        if (forced_upper_bounds.size() == 0) {
-          bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
-                                                     min_data_in_bin);
-        } else {
-          bin_upper_bound_ = FindBinWithPredefinedBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
-                                                      min_data_in_bin, forced_upper_bounds);
-        }
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
+                                                   min_data_in_bin, forced_upper_bounds);
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());

From 8a5244481e1547eb3e730ad65f35d917ca678343 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Fri, 13 Sep 2019 18:43:10 +1000
Subject: [PATCH 40/49] Fix style issues.

---
 src/io/bin.cpp     |  4 ++--
 src/io/dataset.cpp | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 8ca57b936b08..d5f0832961f6 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -204,7 +204,7 @@ namespace LightGBM {
     bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
 
     // add forced bounds, excluding zeros since we have already added zero bounds
-    int i = 0;
+    size_t i = 0;
     while (i < forced_upper_bounds.size()) {
       if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
         forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
@@ -222,7 +222,7 @@ namespace LightGBM {
     // find remaining bounds
     std::vector<double> bounds_to_add;
     int value_ind = 0;
-    for (int i = 0; i < bin_upper_bound.size(); ++i) {
+    for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
       int cnt_in_bin = 0;
       int distinct_cnt_in_bin = 0;
       int bin_start = value_ind;
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 21977a660de0..8048dafdec4e 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -10,10 +10,10 @@
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
 
+#include <limits>
 #include <chrono>
 #include <cstdio>
 #include <fstream>
-#include <limits>
 #include <sstream>
 #include <unordered_map>
 
@@ -726,7 +726,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
       int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
       writer->Write(&num_bounds, sizeof(int));
       
-      for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
         writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
       }
     }
@@ -782,7 +782,7 @@ void Dataset::DumpTextFile(const char* text_filename) {
   fprintf(file, "\nforced_bins: ");
   for (int i = 0; i < num_total_features_; ++i) {
     fprintf(file, "\nfeature %d: ", i);
-    for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
       fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
     }
   }
@@ -1080,11 +1080,11 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
       Json forced_bins_json = Json::parse(buffer.str(), err);
       CHECK(forced_bins_json.is_array());
       std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-      for (int i = 0; i < forced_bins_arr.size(); ++i) {
+      for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
         int feature_num = forced_bins_arr[i]["feature"].int_value();
         CHECK(feature_num < num_total_features);
         std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-        for (int j = 0; j < bounds_arr.size(); ++j) {
+        for (size_t j = 0; j < bounds_arr.size(); ++j) {
           forced_bins[feature_num].push_back(bounds_arr[j].number_value());
         }
       }

From c591e7b8c40de69cd27b2b00f683efd639216046 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Tue, 17 Sep 2019 18:06:56 +1000
Subject: [PATCH 41/49] Fix bug and add new test.

---
 examples/regression/forced_bins2.json    | 6 ++++++
 src/io/bin.cpp                           | 3 ++-
 tests/python_package_test/test_engine.py | 9 +++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 examples/regression/forced_bins2.json

diff --git a/examples/regression/forced_bins2.json b/examples/regression/forced_bins2.json
new file mode 100644
index 000000000000..f4dca0ccaf34
--- /dev/null
+++ b/examples/regression/forced_bins2.json
@@ -0,0 +1,6 @@
+[
+    {
+        "feature": 0,
+        "bin_upper_bound": [ 0.19, 0.39, 0.59, 0.79 ]
+    }
+]
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index d5f0832961f6..23a19273bfbf 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -220,6 +220,7 @@ namespace LightGBM {
     std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
     // find remaining bounds
+    int free_bins = max_bin - static_cast<int>(bin_upper_bound.size());
     std::vector<double> bounds_to_add;
     int value_ind = 0;
     for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
@@ -232,7 +233,7 @@ namespace LightGBM {
         ++value_ind;
       }
       int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
-      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt)));
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * free_bins / total_sample_cnt)));
       num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
       if (i == bin_upper_bound.size() - 1) {
         num_sub_bins = bins_remaining + 1;
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index d36c588164c2..f972a16d8368 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1642,6 +1642,15 @@ def test_forced_bins(self):
         est = lgb.train(params, lgb_x, num_boost_round=100)
         predicted = est.predict(new_x)
         self.assertEqual(len(np.unique(predicted)), 3)
+        params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                                     '../../examples/regression/forced_bins2.json')
+        params['max_bin'] = 11
+        lgb_x = lgb.Dataset(x[:, :1], label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(x[1:, :1])
+        vals, counts = np.unique(predicted, return_counts=True)
+        self.assertGreaterEqual(min(counts), 9)
+        self.assertLessEqual(max(counts), 11)
 
     def test_binning_same_sign(self):
         # test that binning works properly for features with only positive or only negative values

From 9c767ae0f3701e4e02ffcfbbe3bf28afe7a10667 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Sat, 21 Sep 2019 13:54:43 +1000
Subject: [PATCH 42/49] Add warning when using categorical features with forced
 bins.

---
 include/LightGBM/dataset.h |  3 ++-
 src/io/dataset.cpp         | 27 +++++++++++++++++++++------
 src/io/dataset_loader.cpp  |  5 +++--
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 900487eafbf4..6b79ac42770c 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -596,7 +596,8 @@ class Dataset {
 
   void addFeaturesFrom(Dataset* other);
 
-  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features);
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features, 
+                                                        std::unordered_set<int> categorical_features);
 
  private:
   std::string data_filename_;
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 8048dafdec4e..6d16fefdeffc 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -329,7 +329,14 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
-  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
+  // get categorical features from the bin types so that we can read the forced bin bounds
+  std::unordered_set<int> categorical_features;
+  for (int i = 0; i < num_total_features_; ++i){
+    if ((bin_mappers[i] != nullptr) && (bin_mappers[i]->bin_type() == BinType::CategoricalBin)){
+      categorical_features.insert(i);
+    }
+  }
+  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features);
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -363,7 +370,10 @@ void Dataset::ResetConfig(const char* parameters) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
   if (param.count("forcedbins_filename")) {
-    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
+    /* Since the dataset is already constructed we don't know which bins are categorical.
+    Therefore read forced bins assuming no categorical features, and warn if not the same as original. */
+    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, 
+                                                                            num_total_features_, std::unordered_set<int>());
     if (config_bounds != forced_bin_bounds_) {
       Log::Warning("Cannot change forced bins after constructed Dataset handle.");
     }
@@ -1067,7 +1077,8 @@ void Dataset::addFeaturesFrom(Dataset* other) {
 }
 
 
-std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) {
+std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features, 
+                                                        std::unordered_set<int> categorical_features) {
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());
@@ -1083,9 +1094,13 @@ std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_
       for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
         int feature_num = forced_bins_arr[i]["feature"].int_value();
         CHECK(feature_num < num_total_features);
-        std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-        for (size_t j = 0; j < bounds_arr.size(); ++j) {
-          forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+        if (categorical_features.count(feature_num)) {
+          Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num);
+        } else {
+          std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+          for (size_t j = 0; j < bounds_arr.size(); ++j) {
+            forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+          }
         }
       }
       // remove duplicates
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 6e60560a9be1..005bf8082011 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -584,7 +584,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
 
   // get forced split
   std::string forced_bins_path = config_.forcedbins_filename;
-  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col);
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col, categorical_features_);
 
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
@@ -901,7 +901,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
 
   // get forced split
   std::string forced_bins_path = config_.forcedbins_filename;
-  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_);
+  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_, 
+                                                                              categorical_features_);
 
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);

From cf0afd40d9f89a06ccdd68ba2caab3b7f64d4ba5 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Sat, 21 Sep 2019 14:03:15 +1000
Subject: [PATCH 43/49] Pass forced_upper_bounds by reference.

---
 src/io/bin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 23a19273bfbf..1a083c60cff0 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -150,7 +150,7 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double>& forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
 
     // get list of distinct values
@@ -307,7 +307,7 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
-    int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double>& forced_upper_bounds) {
     if (forced_upper_bounds.empty()) {
       return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
     } else {

From 25387ec31e4d16b4f4b4072f891c668d9d4656d3 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Sat, 21 Sep 2019 15:14:40 +1000
Subject: [PATCH 44/49] Pass container types by const reference.

---
 include/LightGBM/bin.h     |  2 +-
 include/LightGBM/dataset.h |  2 +-
 src/io/bin.cpp             | 26 ++++++++++++--------------
 src/io/dataset.cpp         |  2 +-
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 1c5f62cd1907..7ea86acdd764 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -149,7 +149,7 @@ class BinMapper {
   * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
   */
   void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing, std::vector<double> forced_upper_bounds);
+               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);
 
   /*!
   * \brief Use specific number of bin to calculate the size of this class
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 6b79ac42770c..5aa0f8e21aef 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -597,7 +597,7 @@ class Dataset {
   void addFeaturesFrom(Dataset* other);
 
   static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features, 
-                                                        std::unordered_set<int> categorical_features);
+                                                        const std::unordered_set<int>& categorical_features);
 
  private:
   std::string data_filename_;
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 1a083c60cff0..94349e572f52 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -150,7 +150,7 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double>& forced_upper_bounds) {
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
 
     // get list of distinct values
@@ -204,18 +204,16 @@ namespace LightGBM {
     bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
 
     // add forced bounds, excluding zeros since we have already added zero bounds
-    size_t i = 0;
-    while (i < forced_upper_bounds.size()) {
-      if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
-        forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
-      } else {
-        ++i;
-      }
-    }
     int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
-    int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
-    if (num_to_insert > 0) {
-      bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
+    int num_inserted = 0;
+    for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
+      if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
+        bin_upper_bound.push_back(forced_upper_bounds[i]);
+        ++num_inserted;
+      }
+      if (num_inserted >= max_to_insert) {
+        break;
+      }
     }
     std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 
@@ -307,7 +305,7 @@ namespace LightGBM {
   }
 
   std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
-    int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double>& forced_upper_bounds) {
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
     if (forced_upper_bounds.empty()) {
       return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
     } else {
@@ -318,7 +316,7 @@ namespace LightGBM {
 
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
     int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, 
-    std::vector<double> forced_upper_bounds) {
+    const std::vector<double>& forced_upper_bounds) {
     int na_cnt = 0;
     int tmp_num_sample_values = 0;
     for (int i = 0; i < num_sample_values; ++i) {
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 6d16fefdeffc..3b3af864684d 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -1078,7 +1078,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
 
 
 std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features, 
-                                                        std::unordered_set<int> categorical_features) {
+                                                        const std::unordered_set<int>& categorical_features) {
   std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
   if (forced_bins_path != "") {
     std::ifstream forced_bins_stream(forced_bins_path.c_str());

From cc249f0727c92b521c757187236e4c871cf59a85 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Mon, 23 Sep 2019 22:07:08 +1000
Subject: [PATCH 45/49] Get categorical features using FeatureBinMapper.

---
 src/io/dataset.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 3b3af864684d..5a6cc2773e67 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -370,10 +370,17 @@ void Dataset::ResetConfig(const char* parameters) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
   if (param.count("forcedbins_filename")) {
-    /* Since the dataset is already constructed we don't know which bins are categorical.
-    Therefore read forced bins assuming no categorical features, and warn if not the same as original. */
+    // get categorical features from the bin types so that we can read the forced bin bounds
+    std::unordered_set<int> categorical_features;
+    for (int i = 0; i < num_total_features_; ++i) {
+      int fidx = used_feature_map_[i];
+      const BinMapper* bin_mapper = FeatureBinMapper(fidx);
+      if (bin_mapper->bin_type() == BinType::CategoricalBin) {
+        categorical_features.insert(i);
+      }
+    }
     std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, 
-                                                                            num_total_features_, std::unordered_set<int>());
+                                                                            num_total_features_, categorical_features);
     if (config_bounds != forced_bin_bounds_) {
       Log::Warning("Cannot change forced bins after constructed Dataset handle.");
     }

From 0e26e9f33afe92e638f82efb02751409d7990e1d Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Mon, 23 Sep 2019 22:11:12 +1000
Subject: [PATCH 46/49] Fix bug for small max_bin.

---
 src/io/bin.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 94349e572f52..c10d87af42a8 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -207,13 +207,13 @@ namespace LightGBM {
     int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
     int num_inserted = 0;
     for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
+      if (num_inserted >= max_to_insert) {
+        break;
+      }
       if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
         bin_upper_bound.push_back(forced_upper_bounds[i]);
         ++num_inserted;
       }
-      if (num_inserted >= max_to_insert) {
-        break;
-      }
     }
     std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
 

From b5752ec0a1d9887d7aa4e78720ed87d46e0dfcdf Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Fri, 27 Sep 2019 19:06:24 +1000
Subject: [PATCH 47/49] Move GetForcedBins to DatasetLoader.

---
 include/LightGBM/dataset.h        |  3 --
 include/LightGBM/dataset_loader.h |  3 ++
 src/io/dataset.cpp                | 60 ++-----------------------------
 src/io/dataset_loader.cpp         | 49 +++++++++++++++++++++++--
 4 files changed, 52 insertions(+), 63 deletions(-)

diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index cf5129ab6392..bdd693d967c5 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -596,9 +596,6 @@ class Dataset {
 
   void addFeaturesFrom(Dataset* other);
 
-  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features, 
-                                                        const std::unordered_set<int>& categorical_features);
-
  private:
   std::string data_filename_;
   /*! \brief Store used features */
diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h
index ed4c2af93dc7..c5555ef387be 100644
--- a/include/LightGBM/dataset_loader.h
+++ b/include/LightGBM/dataset_loader.h
@@ -36,6 +36,9 @@ class DatasetLoader {
   /*! \brief Disable copy */
   DatasetLoader(const DatasetLoader&) = delete;
 
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features,
+                                                        const std::unordered_set<int>& categorical_features);
+
  private:
   Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
 
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index b70e6fb19952..ea3e35fa6452 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -3,9 +3,9 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset.h>
+#include <LightGBM/dataset_loader.h>
 
 #include <LightGBM/feature_group.h>
-#include <LightGBM/json11.hpp>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/threading.h>
@@ -13,12 +13,9 @@
 #include <limits>
 #include <chrono>
 #include <cstdio>
-#include <fstream>
 #include <sstream>
 #include <unordered_map>
 
-using namespace json11;
-
 
 namespace LightGBM {
 
@@ -336,7 +333,7 @@ void Dataset::Construct(
       categorical_features.insert(i);
     }
   }
-  forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features);
+  forced_bin_bounds_ = DatasetLoader::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features);
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -370,20 +367,7 @@ void Dataset::ResetConfig(const char* parameters) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
   if (param.count("forcedbins_filename")) {
-    // get categorical features from the bin types so that we can read the forced bin bounds
-    std::unordered_set<int> categorical_features;
-    for (int i = 0; i < num_total_features_; ++i) {
-      int fidx = used_feature_map_[i];
-      const BinMapper* bin_mapper = FeatureBinMapper(fidx);
-      if (bin_mapper->bin_type() == BinType::CategoricalBin) {
-        categorical_features.insert(i);
-      }
-    }
-    std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, 
-                                                                            num_total_features_, categorical_features);
-    if (config_bounds != forced_bin_bounds_) {
-      Log::Warning("Cannot change forced bins after constructed Dataset handle.");
-    }
+    Log::Warning("Cannot change forced bins after constructed Dataset handle.");
   }
 
   if (!io_config.monotone_constraints.empty()) {
@@ -1084,42 +1068,4 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   num_groups_ += other->num_groups_;
 }
 
-
-std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features, 
-                                                        const std::unordered_set<int>& categorical_features) {
-  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
-  if (forced_bins_path != "") {
-    std::ifstream forced_bins_stream(forced_bins_path.c_str());
-    if (forced_bins_stream.fail()) {
-      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
-    } else {
-      std::stringstream buffer;
-      buffer << forced_bins_stream.rdbuf();
-      std::string err;
-      Json forced_bins_json = Json::parse(buffer.str(), err);
-      CHECK(forced_bins_json.is_array());
-      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
-      for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
-        int feature_num = forced_bins_arr[i]["feature"].int_value();
-        CHECK(feature_num < num_total_features);
-        if (categorical_features.count(feature_num)) {
-          Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num);
-        } else {
-          std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
-          for (size_t j = 0; j < bounds_arr.size(); ++j) {
-            forced_bins[feature_num].push_back(bounds_arr[j].number_value());
-          }
-        }
-      }
-      // remove duplicates
-      for (int i = 0; i < num_total_features; ++i) {
-        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
-        forced_bins[i].erase(new_end, forced_bins[i].end());
-      }
-    }
-  }
-  return forced_bins;
-}
-
-
 }  // namespace LightGBM
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index b930a2f7449c..476a5a7ae0b4 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -4,11 +4,16 @@
  */
 #include <LightGBM/dataset_loader.h>
 
+#include <LightGBM/json11.hpp>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 
+#include <fstream>
+
+using namespace json11;
+
 namespace LightGBM {
 
 DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename)
@@ -584,7 +589,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
 
   // get forced split
   std::string forced_bins_path = config_.forcedbins_filename;
-  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, num_col, categorical_features_);
+  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, num_col, categorical_features_);
 
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
@@ -901,8 +906,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
 
   // get forced split
   std::string forced_bins_path = config_.forcedbins_filename;
-  std::vector<std::vector<double>> forced_bin_bounds = Dataset::GetForcedBins(forced_bins_path, dataset->num_total_features_, 
-                                                                              categorical_features_);
+  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, dataset->num_total_features_, 
+                                                                                    categorical_features_);
 
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
@@ -1237,4 +1242,42 @@ std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) {
   }
 }
 
+
+
+std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced_bins_path, int num_total_features,
+                                                              const std::unordered_set<int>& categorical_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        if (categorical_features.count(feature_num)) {
+          Log::Warning("Feature %d is categorical. Will ignore forced bins for this  feature.", feature_num);
+        } else {
+          std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+          for (size_t j = 0; j < bounds_arr.size(); ++j) {
+            forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+          }
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
+      }
+    }
+  }
+  return forced_bins;
+}
+
 }  // namespace LightGBM

From 58d86aa6bfe1659b75853cc45d30673c345c0306 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Sat, 28 Sep 2019 13:24:23 +1000
Subject: [PATCH 48/49] Find forced bins in dataset_loader.

---
 include/LightGBM/dataset.h |  1 +
 src/io/dataset.cpp         | 11 ++---------
 src/io/dataset_loader.cpp  |  4 ++--
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index bdd693d967c5..dd52571efbc1 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -290,6 +290,7 @@ class Dataset {
 
   void Construct(
     std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+    std::vector<std::vector<double>>& forced_bins,
     int** sample_non_zero_indices,
     const int* num_per_col,
     size_t total_sample_cnt,
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index ea3e35fa6452..7d74b4fa3f5b 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -3,7 +3,6 @@
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
  */
 #include <LightGBM/dataset.h>
-#include <LightGBM/dataset_loader.h>
 
 #include <LightGBM/feature_group.h>
 #include <LightGBM/utils/array_args.h>
@@ -216,6 +215,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
 
 void Dataset::Construct(
   std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+  std::vector<std::vector<double>>& forced_bins,
   int** sample_non_zero_indices,
   const int* num_per_col,
   size_t total_sample_cnt,
@@ -326,14 +326,7 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
-  // get categorical features from the bin types so that we can read the forced bin bounds
-  std::unordered_set<int> categorical_features;
-  for (int i = 0; i < num_total_features_; ++i){
-    if ((bin_mappers->at(i) != nullptr) && (bin_mappers->at(i)->bin_type() == BinType::CategoricalBin)){
-      categorical_features.insert(i);
-    }
-  }
-  forced_bin_bounds_ = DatasetLoader::GetForcedBins(io_config.forcedbins_filename, num_total_features_, categorical_features);
+  forced_bin_bounds_ = forced_bins;
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 476a5a7ae0b4..da0f6b9dfc32 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -720,7 +720,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     }
   }
   auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
-  dataset->Construct(&bin_mappers, sample_indices, num_per_col, total_sample_size, config_);
+  dataset->Construct(&bin_mappers, forced_bin_bounds, sample_indices, num_per_col, total_sample_size, config_);
   dataset->set_feature_names(feature_names_);
   return dataset.release();
 }
@@ -1053,7 +1053,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     }
   }
   sample_values.clear();
-  dataset->Construct(&bin_mappers, Common::Vector2Ptr<int>(&sample_indices).data(),
+  dataset->Construct(&bin_mappers, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
                      Common::VectorSize<int>(sample_indices).data(), sample_data.size(), config_);
 }
 

From 3e81b944d88934e7e639a1af68acc2cf6abeacc7 Mon Sep 17 00:00:00 2001
From: btrotta <btrotta@users.noreply.github.com>
Date: Sat, 28 Sep 2019 22:15:51 +1000
Subject: [PATCH 49/49] Minor fixes.

---
 include/LightGBM/dataset.h | 2 +-
 src/io/dataset.cpp         | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index dd52571efbc1..3d0ae990201d 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -290,7 +290,7 @@ class Dataset {
 
   void Construct(
     std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
-    std::vector<std::vector<double>>& forced_bins,
+    const std::vector<std::vector<double>>& forced_bins,
     int** sample_non_zero_indices,
     const int* num_per_col,
     size_t total_sample_cnt,
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 7d74b4fa3f5b..54c8fcc22481 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -215,7 +215,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
 
 void Dataset::Construct(
   std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
-  std::vector<std::vector<double>>& forced_bins,
+  const std::vector<std::vector<double>>& forced_bins,
   int** sample_non_zero_indices,
   const int* num_per_col,
   size_t total_sample_cnt,
@@ -436,6 +436,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
   group_feature_cnt_ = dataset->group_feature_cnt_;
   monotone_types_ = dataset->monotone_types_;
   feature_penalty_ = dataset->feature_penalty_;
+  forced_bin_bounds_ = dataset->forced_bin_bounds_;
 }
 
 void Dataset::CreateValid(const Dataset* dataset) {
@@ -490,6 +491,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
   }
   monotone_types_ = dataset->monotone_types_;
   feature_penalty_ = dataset->feature_penalty_;
+  forced_bin_bounds_ = dataset->forced_bin_bounds_;
 }
 
 void Dataset::ReSize(data_size_t num_data) {