Pr4 advanced method monotone constraints (#3264)
* No need to pass the tree to all functions related to monotone constraints because the pointer is shared.

* Fix OppositeChildShouldBeUpdated numerical split optimisation.

* No need to use constraints when computing the output of the root.

* Refactor existing constraints.

* Add advanced constraints method.

* Update tests.

* Add override.

* Linting.

* Add override.

* Simplify condition in LeftRightContainsRelevantInformation.

* Add virtual destructor to FeatureConstraint.

* Remove redundant blank line.

* Linting of else.

* Indentation.

* Lint else.

* Replace non-const references with pointers.

* Fix forgotten reference.

* Leverage USE_MC for efficiency.

* Make constraints const again in feature_histogram.hpp.

* Update docs.

* Add "advanced" to the monotone constraints options.

* Update monotone constraints restrictions.

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Remove superfluous parenthesis.

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Fix loop iterator.

Co-authored-by: Nikita Titov <[email protected]>

* Remove std namespace qualifier.

* Fix unsigned int / size_t comparison.

* Set num_features as int for consistency with the rest of the codebase.

* Make sure constraints exist before recomputing them.

* Initialize previous constraints in UpdateConstraints.

* Update monotone constraints restrictions.

* Refactor UpdateConstraints loop.

* Update src/io/config.cpp

Co-authored-by: Nikita Titov <[email protected]>

* Delete whitespace.

Co-authored-by: Charles Auguste <[email protected]>
Co-authored-by: Nikita Titov <[email protected]>
3 people authored Sep 21, 2020
1 parent 3454698 commit 4278f22
Showing 7 changed files with 842 additions and 99 deletions.
4 changes: 3 additions & 1 deletion docs/Parameters.rst
@@ -462,7 +462,7 @@ Learning Control Parameters
 
 - you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
 
-- ``monotone_constraints_method`` :raw-html:`<a id="monotone_constraints_method" title="Permalink to this parameter" href="#monotone_constraints_method">&#x1F517;&#xFE0E;</a>`, default = ``basic``, type = enum, options: ``basic``, ``intermediate``, aliases: ``monotone_constraining_method``, ``mc_method``
+- ``monotone_constraints_method`` :raw-html:`<a id="monotone_constraints_method" title="Permalink to this parameter" href="#monotone_constraints_method">&#x1F517;&#xFE0E;</a>`, default = ``basic``, type = enum, options: ``basic``, ``intermediate``, ``advanced``, aliases: ``monotone_constraining_method``, ``mc_method``
 
 - used only if ``monotone_constraints`` is set
 
@@ -472,6 +472,8 @@ Learning Control Parameters
 
 - ``intermediate``, a `more advanced method <https://github.com/microsoft/LightGBM/files/3457826/PR-monotone-constraints-report.pdf>`__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
 
+- ``advanced``, an `even more advanced method <https://github.com/microsoft/LightGBM/files/3457826/PR-monotone-constraints-report.pdf>`__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results
+
 - ``monotone_penalty`` :raw-html:`<a id="monotone_penalty" title="Permalink to this parameter" href="#monotone_penalty">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, aliases: ``monotone_splits_penalty``, ``ms_penalty``, ``mc_penalty``, constraints: ``monotone_penalty >= 0.0``
 
 - used only if ``monotone_constraints`` is set
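As a usage note for the documented option: below is a minimal sketch of enabling it through LightGBM's C API. The data file, label column, objective, and iteration count are illustrative placeholders; only ``monotone_constraints`` and ``monotone_constraints_method`` come from the docs above.

#include <LightGBM/c_api.h>

int main() {
  DatasetHandle train_data;
  // Placeholder dataset; any file LightGBM can parse works here.
  LGBM_DatasetCreateFromFile("train.csv", "header=true label=name:y",
                             nullptr, &train_data);

  BoosterHandle booster;
  // Decreasing constraint on the 1st feature, none on the 2nd, increasing
  // on the 3rd, enforced with the "advanced" method added by this PR.
  LGBM_BoosterCreate(train_data,
                     "objective=regression "
                     "monotone_constraints=-1,0,1 "
                     "monotone_constraints_method=advanced",
                     &booster);

  for (int i = 0; i < 100; ++i) {
    int is_finished = 0;
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
    if (is_finished) break;
  }

  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_data);
  return 0;
}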
3 changes: 2 additions & 1 deletion include/LightGBM/config.h
@@ -443,11 +443,12 @@ struct Config {
 
   // type = enum
   // alias = monotone_constraining_method, mc_method
-  // options = basic, intermediate
+  // options = basic, intermediate, advanced
   // desc = used only if ``monotone_constraints`` is set
   // desc = monotone constraints method
   // descl2 = ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions
   // descl2 = ``intermediate``, a `more advanced method <https://github.com/microsoft/LightGBM/files/3457826/PR-monotone-constraints-report.pdf>`__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
+  // descl2 = ``advanced``, an `even more advanced method <https://github.com/microsoft/LightGBM/files/3457826/PR-monotone-constraints-report.pdf>`__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results
   std::string monotone_constraints_method = "basic";
 
   // alias = monotone_splits_penalty, ms_penalty, mc_penalty
8 changes: 4 additions & 4 deletions src/io/config.cpp
@@ -345,15 +345,15 @@ void Config::CheckParamConflict() {
     min_data_in_leaf = 2;
     Log::Warning("min_data_in_leaf has been increased to 2 because this is required when path smoothing is active.");
   }
-  if (is_parallel && monotone_constraints_method == std::string("intermediate")) {
+  if (is_parallel && (monotone_constraints_method == std::string("intermediate") || monotone_constraints_method == std::string("advanced"))) {
     // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints.
-    Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method.");
+    Log::Warning("Cannot use \"intermediate\" or \"advanced\" monotone constraints in parallel learning, auto set to \"basic\" method.");
     monotone_constraints_method = "basic";
   }
-  if (feature_fraction_bynode != 1.0 && monotone_constraints_method == std::string("intermediate")) {
+  if (feature_fraction_bynode != 1.0 && (monotone_constraints_method == std::string("intermediate") || monotone_constraints_method == std::string("advanced"))) {
     // "intermediate" monotone constraints need to recompute splits. If the features are sampled when computing the
     // split initially, then the sampling needs to be recorded or done once again, which is currently not supported
-    Log::Warning("Cannot use \"intermediate\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method.");
+    Log::Warning("Cannot use \"intermediate\" or \"advanced\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method.");
     monotone_constraints_method = "basic";
   }
   if (max_depth > 0 && monotone_penalty >= max_depth) {
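These checks repair a conflicting configuration with a warning rather than rejecting it. Below is a small sketch of the observable behavior, assuming the usual flow where Config::Set invokes CheckParamConflict(); the parameter values are illustrative.

#include <iostream>

#include <LightGBM/config.h>

int main() {
  LightGBM::Config config;
  // "advanced" is incompatible with per-node feature sampling, so the
  // conflict check downgrades the method to "basic" and logs a warning.
  config.Set(LightGBM::Config::Str2Map(
      "monotone_constraints=-1,0,1 "
      "monotone_constraints_method=advanced "
      "feature_fraction_bynode=0.8"));
  std::cout << config.monotone_constraints_method << std::endl;  // "basic"
  return 0;
}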
52 changes: 39 additions & 13 deletions src/treelearner/feature_histogram.hpp
@@ -84,7 +84,7 @@ class FeatureHistogram {
 
   void FindBestThreshold(double sum_gradient, double sum_hessian,
                          data_size_t num_data,
-                         const ConstraintEntry& constraints,
+                         const FeatureConstraint* constraints,
                          double parent_output,
                          SplitInfo* output) {
     output->default_left = true;
@@ -158,7 +158,7 @@ class FeatureHistogram {
 #define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING
 #define LAMBDA_ARGUMENTS \
   double sum_gradient, double sum_hessian, data_size_t num_data, \
-      const ConstraintEntry &constraints, double parent_output, SplitInfo *output
+      const FeatureConstraint* constraints, double parent_output, SplitInfo *output
 #define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, num_data, output, &rand_threshold
 #define FUNC_ARGUMENTS \
   sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \
@@ -278,7 +278,7 @@ class FeatureHistogram {
   void FindBestThresholdCategoricalInner(double sum_gradient,
                                          double sum_hessian,
                                          data_size_t num_data,
-                                         const ConstraintEntry& constraints,
+                                         const FeatureConstraint* constraints,
                                          double parent_output,
                                          SplitInfo* output) {
     is_splittable_ = false;
@@ -288,6 +288,9 @@ class FeatureHistogram {
     double best_sum_left_gradient = 0;
     double best_sum_left_hessian = 0;
     double gain_shift;
+    if (USE_MC) {
+      constraints->InitCumulativeConstraints(true);
+    }
     if (USE_SMOOTHING) {
       gain_shift = GetLeafGainGivenOutput<USE_L1>(
           sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output);
@@ -474,14 +477,14 @@ class FeatureHistogram {
       output->left_output = CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
           best_sum_left_gradient, best_sum_left_hessian,
           meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
-          constraints, meta_->config->path_smooth, best_left_count, parent_output);
+          constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output);
       output->left_count = best_left_count;
       output->left_sum_gradient = best_sum_left_gradient;
       output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
       output->right_output = CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
           sum_gradient - best_sum_left_gradient,
           sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2,
-          meta_->config->max_delta_step, constraints, meta_->config->path_smooth,
+          meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth,
           num_data - best_left_count, parent_output);
       output->right_count = num_data - best_left_count;
       output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
@@ -763,7 +766,7 @@ class FeatureHistogram {
   template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
   static double CalculateSplittedLeafOutput(
       double sum_gradients, double sum_hessians, double l1, double l2,
-      double max_delta_step, const ConstraintEntry& constraints,
+      double max_delta_step, const BasicConstraint& constraints,
       double smoothing, data_size_t num_data, double parent_output) {
     double ret = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
         sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output);
@@ -784,7 +787,7 @@ class FeatureHistogram {
                                double sum_right_gradients,
                                double sum_right_hessians, double l1, double l2,
                                double max_delta_step,
-                               const ConstraintEntry& constraints,
+                               const FeatureConstraint* constraints,
                                int8_t monotone_constraint,
                                double smoothing,
                                data_size_t left_count,
@@ -803,11 +806,11 @@ class FeatureHistogram {
     double left_output =
         CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
             sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step,
-            constraints, smoothing, left_count, parent_output);
+            constraints->LeftToBasicConstraint(), smoothing, left_count, parent_output);
     double right_output =
         CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
             sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step,
-            constraints, smoothing, right_count, parent_output);
+            constraints->RightToBasicConstraint(), smoothing, right_count, parent_output);
     if (((monotone_constraint > 0) && (left_output > right_output)) ||
         ((monotone_constraint < 0) && (left_output < right_output))) {
       return 0;
@@ -854,7 +857,7 @@ class FeatureHistogram {
             bool REVERSE, bool SKIP_DEFAULT_BIN, bool NA_AS_MISSING>
   void FindBestThresholdSequentially(double sum_gradient, double sum_hessian,
                                      data_size_t num_data,
-                                     const ConstraintEntry& constraints,
+                                     const FeatureConstraint* constraints,
                                      double min_gain_shift, SplitInfo* output,
                                      int rand_threshold, double parent_output) {
     const int8_t offset = meta_->offset;
@@ -864,6 +867,16 @@ class FeatureHistogram {
     data_size_t best_left_count = 0;
     uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
     const double cnt_factor = num_data / sum_hessian;
+
+    BasicConstraint best_right_constraints;
+    BasicConstraint best_left_constraints;
+    bool constraint_update_necessary =
+        USE_MC && constraints->ConstraintDifferentDependingOnThreshold();
+
+    if (USE_MC) {
+      constraints->InitCumulativeConstraints(REVERSE);
+    }
+
     if (REVERSE) {
       double sum_right_gradient = 0.0f;
       double sum_right_hessian = kEpsilon;
@@ -910,6 +923,11 @@ class FeatureHistogram {
             continue;
           }
         }
+
+        if (USE_MC && constraint_update_necessary) {
+          constraints->Update(t + offset);
+        }
+
         // current split gain
         double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
             sum_left_gradient, sum_left_hessian, sum_right_gradient,
@@ -932,6 +950,10 @@ class FeatureHistogram {
           // left is <= threshold, right is > threshold. so this is t-1
           best_threshold = static_cast<uint32_t>(t - 1 + offset);
           best_gain = current_gain;
+          if (USE_MC) {
+            best_right_constraints = constraints->RightToBasicConstraint();
+            best_left_constraints = constraints->LeftToBasicConstraint();
+          }
         }
       }
     } else {
@@ -1016,6 +1038,10 @@ class FeatureHistogram {
           best_sum_left_hessian = sum_left_hessian;
           best_threshold = static_cast<uint32_t>(t + offset);
           best_gain = current_gain;
+          if (USE_MC) {
+            best_right_constraints = constraints->RightToBasicConstraint();
+            best_left_constraints = constraints->LeftToBasicConstraint();
+          }
         }
       }
     }
@@ -1027,7 +1053,7 @@ class FeatureHistogram {
         CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
            best_sum_left_gradient, best_sum_left_hessian,
            meta_->config->lambda_l1, meta_->config->lambda_l2,
-           meta_->config->max_delta_step, constraints, meta_->config->path_smooth,
+           meta_->config->max_delta_step, best_left_constraints, meta_->config->path_smooth,
            best_left_count, parent_output);
     output->left_count = best_left_count;
     output->left_sum_gradient = best_sum_left_gradient;
@@ -1037,7 +1063,7 @@ class FeatureHistogram {
         sum_gradient - best_sum_left_gradient,
         sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1,
         meta_->config->lambda_l2, meta_->config->max_delta_step,
-        constraints, meta_->config->path_smooth, num_data - best_left_count,
+        best_right_constraints, meta_->config->path_smooth, num_data - best_left_count,
         parent_output);
     output->right_count = num_data - best_left_count;
     output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
@@ -1053,7 +1079,7 @@ class FeatureHistogram {
   hist_t* data_;
   bool is_splittable_ = true;
 
-  std::function<void(double, double, data_size_t, const ConstraintEntry&,
+  std::function<void(double, double, data_size_t, const FeatureConstraint*,
                      double, SplitInfo*)>
       find_best_threshold_fun_;
 };
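The histogram code above manipulates constraints only through an abstract interface. As a reading aid, here is a hedged sketch of the contract those call sites rely on; the real definitions live in the tree learner's monotone constraint code (monotone_constraints.hpp) and may differ beyond the members exercised above.

#include <limits>

// Plain [min, max] bounds on a leaf output; this is what the leaf output
// calculation ultimately consumes.
struct BasicConstraint {
  double min = -std::numeric_limits<double>::max();
  double max = std::numeric_limits<double>::max();
};

// Interface assumed by FindBestThresholdSequentially above. Methods are
// const because the histogram code holds a const pointer; stateful
// implementations would rely on mutable internals.
class FeatureConstraint {
 public:
  virtual ~FeatureConstraint() {}  // virtual destructor added in this PR
  // Prepare cumulative constraints before a bin scan; `reverse` matches
  // the REVERSE template flag (scan direction over bins).
  virtual void InitCumulativeConstraints(bool reverse) const = 0;
  // Refresh per-threshold constraints at candidate bin `bin` (t + offset).
  virtual void Update(int bin) const = 0;
  // Collapse the current state to simple bounds for each child.
  virtual BasicConstraint LeftToBasicConstraint() const = 0;
  virtual BasicConstraint RightToBasicConstraint() const = 0;
  // True only for methods whose bounds change with the threshold, so that
  // Update() needs to run inside the scan loop.
  virtual bool ConstraintDifferentDependingOnThreshold() const = 0;
};

Splitting the API into a threshold-dependent FeatureConstraint and a plain BasicConstraint appears to be what lets the basic and intermediate methods skip the per-bin Update() work that only the advanced method requires (guarded above by constraint_update_necessary).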
