From 1602179b24b0fdaff3c6e0420c65f41167bb9c4d Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 17 Feb 2020 15:53:34 +0000 Subject: [PATCH 01/47] Add util functions. --- include/LightGBM/tree.h | 74 +++++++++++++++++++++++++++++++++++++++-- src/io/tree.cpp | 10 ++++-- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 9e72e056ba62..38390f8b329d 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -60,7 +60,7 @@ class Tree { int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, double left_weight, double right_weight, - float gain, MissingType missing_type, bool default_left); + float gain, MissingType missing_type, bool default_left, bool feature_is_monotone); /*! * \brief Performing a split on tree leaves, with categorical feature @@ -134,6 +134,28 @@ class Tree { inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndexByMap(const std::unordered_map& feature_values) const; +<<<<<<< HEAD +======= + // Get node parent + inline int node_parent(int node_idx) const; + // Get leaf parent + inline int leaf_parent(int node_idx) const; + + // Get children + inline int left_child(int node_idx) const; + inline int right_child(int node_idx) const; + + // Get if the feature is in a monotone subtree + inline bool leaf_is_in_monotone_subtree(int leaf_idx) const; + + inline double internal_value(int node_idx) const; + + inline uint32_t threshold_in_bin(int node_idx) const; + + // Get the feature corresponding to the split + inline int split_feature_inner(int node_idx) const; + +>>>>>>> Add util functions. inline void PredictContrib(const double* feature_values, int num_features, double* output); /*! \brief Get Number of leaves*/ @@ -326,7 +348,7 @@ class Tree { } inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain); + double left_weight, double right_weight, float gain, bool feature_is_monotone); /*! * \brief Find leaf index of which record belongs by features * \param feature_values Feature value of this record @@ -425,12 +447,22 @@ class Tree { std::vector leaf_depth_; double shrinkage_; int max_depth_; + // add parent node information + std::vector node_parent_; + // Keeps track of the monotone splits above the leaf + std::vector leaf_is_in_monotone_subtree_; }; inline void Tree::Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain) { + double left_weight, double right_weight, float gain, bool feature_is_monotone) { int new_node_idx = num_leaves_ - 1; + + // Update if there is a monotone split above the leaf + if (feature_is_monotone || leaf_is_in_monotone_subtree_[leaf]) { + leaf_is_in_monotone_subtree_[leaf] = true; + leaf_is_in_monotone_subtree_[num_leaves_] = true; + } // update parent info int parent = leaf_parent_[leaf]; if (parent >= 0) { @@ -444,6 +476,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature, // add new node split_feature_inner_[new_node_idx] = feature; split_feature_[new_node_idx] = real_feature; + node_parent_[new_node_idx] = parent; split_gain_[new_node_idx] = gain; // add two new leaves @@ -552,6 +585,41 @@ inline int Tree::GetLeafByMap(const std::unordered_map& feature_val return ~node; } +inline int Tree::node_parent(int node_idx) const{ + return node_parent_[node_idx]; +} + +inline int Tree::left_child(int node_idx) const{ + return left_child_[node_idx]; +} + +inline int Tree::right_child(int node_idx) const{ + return right_child_[node_idx]; +} + +inline int Tree::split_feature_inner(int node_idx) const{ + return split_feature_inner_[node_idx]; +} + +inline int Tree::leaf_parent(int node_idx) const{ + return leaf_parent_[node_idx]; +} + +inline uint32_t Tree::threshold_in_bin(int node_idx) const{ + #ifdef DEBUG + CHECK(node_idx >= 0); + #endif + return threshold_in_bin_[node_idx]; +} + +inline bool Tree::leaf_is_in_monotone_subtree(int leaf_idx) const { + return leaf_is_in_monotone_subtree_[leaf_idx]; +} + +inline double Tree::internal_value(int node_idx) const { + return internal_value_[node_idx]; +} + } // namespace LightGBM diff --git a/src/io/tree.cpp b/src/io/tree.cpp index affc7080472f..42519ffb727a 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -24,7 +24,9 @@ Tree::Tree(int max_leaves) threshold_.resize(max_leaves_ - 1); decision_type_.resize(max_leaves_ - 1, 0); split_gain_.resize(max_leaves_ - 1); + node_parent_.resize(max_leaves_ - 1); leaf_parent_.resize(max_leaves_); + leaf_is_in_monotone_subtree_.resize(max_leaves_); leaf_value_.resize(max_leaves_); leaf_weight_.resize(max_leaves_); leaf_count_.resize(max_leaves_); @@ -38,6 +40,7 @@ Tree::Tree(int max_leaves) leaf_value_[0] = 0.0f; leaf_weight_[0] = 0.0f; leaf_parent_[0] = -1; + node_parent_.resize(max_leaves_ - 1); shrinkage_ = 1.0f; num_cat_ = 0; cat_boundaries_.push_back(0); @@ -50,8 +53,9 @@ Tree::~Tree() { int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type, bool default_left) { - Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); + int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, + MissingType missing_type, bool default_left, bool feature_was_monotone) { + Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain, feature_was_monotone); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask); @@ -72,7 +76,7 @@ int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, const uint32_t* threshold, int num_threshold, double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) { - Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); + Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain, false); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask); From 80c434ed1ff9e5bdffad2af3f5e7eb9c1ed6f439 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 18 Feb 2020 13:49:53 +0000 Subject: [PATCH 02/47] Added monotone_constraints_method as a parameter. --- docs/Parameters.rst | 7 +++++++ include/LightGBM/config.h | 5 +++++ src/io/config_auto.cpp | 5 +++++ src/treelearner/monotone_constraints.cpp | 0 4 files changed, 17 insertions(+) create mode 100644 src/treelearner/monotone_constraints.cpp diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 573602c47f61..0625998f1820 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -376,6 +376,13 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout +- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method``, constraints: ``0.0 <= monotone_penalty (< max_depth, if max_depth > 0)`` + + - used only if ``monotone_constraints`` is set + + - monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may very slightly slow the library. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. + + - ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int - used only in ``dart`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index dae4a3f9c1fe..b2cab634e6a7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -364,6 +364,11 @@ struct Config { // desc = dropout rate: a fraction of previous trees to drop during the dropout double drop_rate = 0.1; + // alias = monotone_constraining_method + // desc = used only if ``monotone_constraints`` is set + // desc = monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may slow the library very slightly. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. + std::string monotone_constraints_method = "basic"; + // desc = used only in ``dart`` // desc = max number of dropped trees during one boosting iteration // desc = ``<=0`` means no limit diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index c4537a67656b..9ec36f3efbae 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -82,6 +82,7 @@ const std::unordered_map& Config::alias_table() { {"lambda", "lambda_l2"}, {"min_split_gain", "min_gain_to_split"}, {"rate_drop", "drop_rate"}, + {"monotone_constraining_method", "monotone_constraints_method"}, {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, @@ -201,6 +202,7 @@ const std::unordered_set& Config::parameter_set() { "lambda_l2", "min_gain_to_split", "drop_rate", + "monotone_constraints_method", "max_drop", "skip_drop", "xgboost_dart_mode", @@ -372,6 +374,8 @@ void Config::GetMembersFromString(const std::unordered_map Date: Tue, 18 Feb 2020 14:05:38 +0000 Subject: [PATCH 03/47] Add the intermediate constraining method. --- src/treelearner/leaf_splits.hpp | 13 ++ src/treelearner/monotone_constraints.cpp | 0 src/treelearner/monotone_constraints.hpp | 263 ++++++++++++++++++++++- src/treelearner/serial_tree_learner.cpp | 89 +++++++- src/treelearner/serial_tree_learner.h | 7 + 5 files changed, 367 insertions(+), 5 deletions(-) delete mode 100644 src/treelearner/monotone_constraints.cpp diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 84323c3349b5..93c9f71d627f 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -45,6 +45,19 @@ class LeafSplits { sum_hessians_ = sum_hessians; } + /*! + + * \brief Init split on current leaf on partial data. + * \param leaf Index of current leaf + * \param sum_gradients + * \param sum_hessians + */ + void Init(int leaf, double sum_gradients, double sum_hessians) { + leaf_index_ = leaf; + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + } + /*! * \brief Init splits on current leaf, it will traverse all data to sum up the results * \param gradients diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 88658208e94e..dfb8728003a1 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -9,9 +9,24 @@ #include #include #include +#include "split_info.hpp" namespace LightGBM { +struct CostEfficientGradientBoosting; + +struct LearnerState { + const Config *config_; + const Dataset *train_data_; + const Tree *tree; + std::unique_ptr &cegb_; + + LearnerState(const Config *config_, const Dataset *train_data_, + const Tree *tree, + std::unique_ptr &cegb_) + : config_(config_), train_data_(train_data_), tree(tree), cegb_(cegb_) {}; +}; + struct ConstraintEntry { double min = -std::numeric_limits::max(); double max = std::numeric_limits::max(); @@ -26,6 +41,22 @@ struct ConstraintEntry { void UpdateMin(double new_min) { min = std::max(new_min, min); } void UpdateMax(double new_max) { max = std::min(new_max, max); } + + bool UpdateMinAndReturnBoolIfChanged(double new_min) { + if (new_min > min) { + min = new_min; + return true; + } + return false; + } + + bool UpdateMaxAndReturnBoolIfChanged(double new_max) { + if (new_max < max) { + max = new_max; + return true; + } + return false; + } }; template @@ -33,6 +64,7 @@ class LeafConstraints { public: explicit LeafConstraints(int num_leaves) : num_leaves_(num_leaves) { entries_.resize(num_leaves_); + leaves_to_update.reserve(num_leaves_); } void Reset() { @@ -41,7 +73,7 @@ class LeafConstraints { } } - void UpdateConstraints(bool is_numerical_split, int leaf, int new_leaf, + void UpdateConstraintsWithMid(bool is_numerical_split, int leaf, int new_leaf, int8_t monotone_type, double right_output, double left_output) { entries_[new_leaf] = entries_[leaf]; @@ -57,8 +89,237 @@ class LeafConstraints { } } + void UpdateConstraintsWithOutputs(bool is_numerical_split, int leaf, + int new_leaf, int8_t monotone_type, + double right_output, double left_output) { + entries_[new_leaf] = entries_[leaf]; + if (is_numerical_split) { + if (monotone_type < 0) { + entries_[leaf].UpdateMin(right_output); + entries_[new_leaf].UpdateMax(left_output); + } else if (monotone_type > 0) { + entries_[leaf].UpdateMax(right_output); + entries_[new_leaf].UpdateMin(left_output); + } + } + } + + void GoUpToFindLeavesToUpdate(int node_idx, std::vector &features, + std::vector &thresholds, + std::vector &is_in_right_split, + int split_feature, const SplitInfo &split_info, + uint32_t split_threshold, + std::vector &best_split_per_leaf_, + LearnerState &learner_state) { + + int parent_idx = learner_state.tree->node_parent(node_idx); + if (parent_idx != -1) { + int inner_feature = learner_state.tree->split_feature_inner(parent_idx); + int8_t monotone_type = + learner_state.train_data_->FeatureMonotone(inner_feature); + bool is_right_split = + learner_state.tree->right_child(parent_idx) == node_idx; + bool split_contains_new_information = true; + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + // only branches containing leaves that are contiguous to the original + // leaf need to be updated + for (unsigned int i = 0; i < features.size(); ++i) { + if ((features[i] == inner_feature && is_split_numerical) && + (is_in_right_split[i] == is_right_split)) { + split_contains_new_information = false; + break; + } + } + + if (split_contains_new_information) { + if (monotone_type != 0) { + int left_child_idx = learner_state.tree->left_child(parent_idx); + int right_child_idx = learner_state.tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + int node_idx_to_pass = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + bool take_min = (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + GoDownToFindLeavesToUpdate( + node_idx_to_pass, features, thresholds, is_in_right_split, + take_min, split_feature, split_info, true, true, split_threshold, + best_split_per_leaf_, + learner_state); + } + + is_in_right_split.push_back( + learner_state.tree->right_child(parent_idx) == node_idx); + thresholds.push_back(learner_state.tree->threshold_in_bin(parent_idx)); + features.push_back(learner_state.tree->split_feature_inner(parent_idx)); + } + + if (parent_idx != 0) { + LeafConstraints::GoUpToFindLeavesToUpdate( + parent_idx, features, thresholds, is_in_right_split, split_feature, + split_info, split_threshold, best_split_per_leaf_, + learner_state); + } + } + } + + void GoDownToFindLeavesToUpdate( + int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, int maximum, + int split_feature, const SplitInfo &split_info, bool use_left_leaf, + bool use_right_leaf, uint32_t split_threshold, + std::vector &best_split_per_leaf_, + LearnerState &learner_state) { + + if (node_idx < 0) { + int leaf_idx = ~node_idx; + + // if a leaf is at max depth then there is no need to update it + int max_depth = learner_state.config_->max_depth; + if (learner_state.tree->leaf_depth(leaf_idx) >= max_depth && + max_depth > 0) { + return; + } + + // splits that are not to be used shall not be updated + if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { + return; + } + + std::pair min_max_constraints; + bool something_changed; + if (use_right_leaf && use_left_leaf) { + min_max_constraints = + std::minmax(split_info.right_output, split_info.left_output); + } else if (use_right_leaf && !use_left_leaf) { + min_max_constraints = std::pair( + split_info.right_output, split_info.right_output); + } else { + min_max_constraints = std::pair(split_info.left_output, + split_info.left_output); + } + +#ifdef DEBUG + if (maximum) { + CHECK(min_max_constraints.first >= + learner_state.tree->LeafOutput(leaf_idx)); + } else { + CHECK(min_max_constraints.second <= + learner_state.tree->LeafOutput(leaf_idx)); + } +#endif + + if (!maximum) { + something_changed = entries_[leaf_idx].UpdateMinAndReturnBoolIfChanged( + min_max_constraints.second); + } else { + something_changed = entries_[leaf_idx].UpdateMaxAndReturnBoolIfChanged( + min_max_constraints.first); + } + if (!something_changed) { + return; + } + leaves_to_update.push_back(leaf_idx); + + } else { + // check if the children are contiguous with the original leaf + std::pair keep_going_left_right = ShouldKeepGoingLeftRight( + learner_state.tree, node_idx, features, thresholds, is_in_right_split, + learner_state.train_data_); + int inner_feature = learner_state.tree->split_feature_inner(node_idx); + uint32_t threshold = learner_state.tree->threshold_in_bin(node_idx); + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + bool use_left_leaf_for_update = true; + bool use_right_leaf_for_update = true; + if (is_split_numerical && inner_feature == split_feature) { + if (threshold >= split_threshold) { + use_left_leaf_for_update = false; + } + if (threshold <= split_threshold) { + use_right_leaf_for_update = false; + } + } + + if (keep_going_left_right.first) { + GoDownToFindLeavesToUpdate( + learner_state.tree->left_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + use_left_leaf, use_right_leaf_for_update && use_right_leaf, + split_threshold, best_split_per_leaf_, + learner_state); + } + if (keep_going_left_right.second) { + GoDownToFindLeavesToUpdate( + learner_state.tree->right_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + use_left_leaf_for_update && use_left_leaf, use_right_leaf, + split_threshold, best_split_per_leaf_, + learner_state); + } + } + } + + void GoUpToFindLeavesToUpdate(int node_idx, int split_feature, + const SplitInfo &split_info, + uint32_t split_threshold, + std::vector &best_split_per_leaf_, + LearnerState &learner_state) { + int depth = learner_state.tree->leaf_depth( + ~learner_state.tree->left_child(node_idx)) - + 1; + + std::vector features; + std::vector thresholds; + std::vector is_in_right_split; + + features.reserve(depth); + thresholds.reserve(depth); + is_in_right_split.reserve(depth); + + GoUpToFindLeavesToUpdate( + node_idx, features, thresholds, is_in_right_split, split_feature, + split_info, split_threshold, best_split_per_leaf_, + learner_state); + } + + std::pair ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, const Dataset *train_data_) { + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + bool keep_going_right = true; + bool keep_going_left = true; + // left and right nodes are checked to find out if they are contiguous with the original leaf + // if so the algorithm should keep going down these nodes to update constraints + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature) { + if (is_split_numerical) { + if (threshold >= thresholds[i] && !is_in_right_split[i]) { + keep_going_right = false; + } + if (threshold <= thresholds[i] && is_in_right_split[i]) { + keep_going_left = false; + } + } + } + } + return std::pair(keep_going_left, keep_going_right); + } + const ConstraintEntry& Get(int leaf_idx) const { return entries_[leaf_idx]; } + std::vector leaves_to_update; + private: int num_leaves_; std::vector entries_; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 528063c78a32..14d6af8c8031 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -618,13 +618,47 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + + // constraints are updated differently depending on the method used + if (!config_->monotone_constraints.empty() && + tree->leaf_is_in_monotone_subtree(*right_leaf)) { + if (config_->monotone_constraints_method == "basic") { + constraints_->UpdateConstraintsWithMid( + is_numerical_split, *left_leaf, *right_leaf, + best_split_info.monotone_type, best_split_info.right_output, + best_split_info.left_output); + + } else if (config_->monotone_constraints_method == "intermediate") { + constraints_->UpdateConstraintsWithOutputs( + is_numerical_split, *left_leaf, *right_leaf, + best_split_info.monotone_type, best_split_info.right_output, + best_split_info.left_output); + + LearnerState learner_state(config_, train_data_, tree, cegb_); + + // if there is a monotone split above, new values should + // be checked not to clash with existing constraints in the subtree; + // if they do, the existing splits need to be updated + constraints_->GoUpToFindLeavesToUpdate( + tree->leaf_parent(*right_leaf), inner_feature_index, best_split_info, + best_split_info.threshold, best_split_per_leaf_, learner_state); + + // Update splits that just got their constraints updated + while (!constraints_->leaves_to_update.empty()) { + int leaf_idx = constraints_->leaves_to_update.back(); + UpdateBestSplitsFromHistograms( + &best_split_per_leaf_[leaf_idx], leaf_idx, is_feature_used_, + num_features_, histogram_pool_, learner_state); + constraints_->leaves_to_update.pop_back(); + } + } else { + throw "monotone_constraints_methode has to be one of ('basic', " + "'intermediate')"; // FIXME improve how this exception is handled + } } - constraints_->UpdateConstraints(is_numerical_split, *left_leaf, *right_leaf, - best_split_info.monotone_type, - best_split_info.right_output, - best_split_info.left_output); } +>>>>>>> Add the intermediate constraining method. void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { if (obj != nullptr && obj->IsRenewTreeOutput()) { @@ -687,4 +721,51 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } } + +// this function updates the best split for a leaf +// it is called only when monotone constraints exist +void SerialTreeLearner::UpdateBestSplitsFromHistograms( + SplitInfo *split, int leaf, + const std::vector &is_feature_used_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + // the feature histogram is retrieved + FeatureHistogram *histogram_array_; + histogram_pool_.Get(leaf, &histogram_array_); + double sum_gradients = split->left_sum_gradient + split->right_sum_gradient; + double sum_hessians = split->left_sum_hessian + split->right_sum_hessian; + int num_data = split->left_count + split->right_count; + split->gain = kMinScore; + + OMP_INIT_EX(); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + // splits are computed for the feature that are to be used + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + // loop through the features to find the best one just like in the + // FindBestSplitsFromHistograms function + int real_fidx = learner_state.train_data_->RealFeatureIndex(feature_index); + LeafSplits *leaf_splits = new LeafSplits(0); + leaf_splits->Init(leaf, sum_gradients, + sum_hessians); + + // best split is updated + ComputeBestSplitForFeature( + histogram_array_, + feature_index, + real_fidx, + is_feature_used_[feature_index], // FIXME is this the right parameter to pass here? + num_data, + leaf_splits, + split); + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); +} + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 2223b3e247bf..23d8246ec913 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -122,6 +122,13 @@ class SerialTreeLearner: public TreeLearner { void GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time); + + void UpdateBestSplitsFromHistograms( + SplitInfo *split, int leaf, const std::vector &is_feature_used_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state); + + /*! * \brief Some initial works before training */ From 3bcb331030d0f7dfcd69fb16168fca18b7c5ed3c Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 18 Feb 2020 14:17:17 +0000 Subject: [PATCH 04/47] Updated tests. --- tests/python_package_test/test_engine.py | 71 ++++++++++++++++-------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 976f15275810..31b0897bc61e 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -995,45 +995,68 @@ def test_init_with_subset(self): self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin") self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin") - def test_monotone_constraint(self): + # test if categorical features and monotone features can both be in a dataset without causing issues + def generate_trainset_for_monotone_constraints_tests(self): + number_of_dpoints = 3000 + x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) + x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + x = np.column_stack( + (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_negatively_correlated_with_y)) + zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) + scales = 10. * (np.random.random(6) + 0.5) + y = (scales[0] * x1_positively_correlated_with_y + + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y) + - scales[2] * x2_negatively_correlated_with_y + - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y) + - scales[4] * x3_negatively_correlated_with_y + - np.cos(scales[5] * np.pi * x3_negatively_correlated_with_y) + + zs) + trainset = lgb.Dataset(x, label=y) + return trainset + + def test_monotone_constraints(self): def is_increasing(y): return (np.diff(y) >= 0.0).all() def is_decreasing(y): return (np.diff(y) <= 0.0).all() + def is_non_monotone(y): + return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any() + def is_correctly_constrained(learner): - n = 200 + iterations = 10 + n = 1000 variable_x = np.linspace(0, 1, n).reshape((n, 1)) fixed_xs_values = np.linspace(0, 1, n) - for i in range(n): + for i in range(iterations): fixed_x = fixed_xs_values[i] * np.ones((n, 1)) - monotonically_increasing_x = np.column_stack((variable_x, fixed_x)) + monotonically_increasing_x = np.column_stack((variable_x, fixed_x, fixed_x)) monotonically_increasing_y = learner.predict(monotonically_increasing_x) - monotonically_decreasing_x = np.column_stack((fixed_x, variable_x)) + monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, fixed_x)) monotonically_decreasing_y = learner.predict(monotonically_decreasing_x) - if not (is_increasing(monotonically_increasing_y) and is_decreasing(monotonically_decreasing_y)): + non_monotone_x = np.column_stack((fixed_x, fixed_x, variable_x)) + non_monotone_y = learner.predict(non_monotone_x) + if not (is_increasing(monotonically_increasing_y) and + is_decreasing(monotonically_decreasing_y) and + is_non_monotone(non_monotone_y)): return False return True - number_of_dpoints = 2000 - x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) - x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) - x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y)) - zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) - y = (5 * x1_positively_correlated_with_y - + np.sin(10 * np.pi * x1_positively_correlated_with_y) - - 5 * x2_negatively_correlated_with_y - - np.cos(10 * np.pi * x2_negatively_correlated_with_y) - + zs) - trainset = lgb.Dataset(x, label=y) - params = { - 'min_data': 20, - 'num_leaves': 20, - 'monotone_constraints': '1,-1' - } - constrained_model = lgb.train(params, trainset) - self.assertTrue(is_correctly_constrained(constrained_model)) + number_of_trials = 10 + for _ in range(number_of_trials): + for monotone_constraints_method in ["basic", "intermediate"]: + trainset = self.generate_trainset_for_monotone_constraints_tests() + params = { + 'min_data': 20, + 'num_leaves': 20, + 'monotone_constraints': '1,-1,0', + "monotone_constraints_method": monotone_constraints_method, + "use_missing": False, + } + constrained_model = lgb.train(params, trainset) + self.assertTrue(is_correctly_constrained(constrained_model)) def test_max_bin_by_feature(self): col1 = np.arange(0, 100)[:, np.newaxis] From e3ac0387875143447527efdbe6a0b96a109a6e98 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 18 Feb 2020 15:34:09 +0000 Subject: [PATCH 05/47] Minor fixes. --- src/treelearner/monotone_constraints.hpp | 33 ++++++++++++------------ src/treelearner/serial_tree_learner.cpp | 2 +- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index dfb8728003a1..43ac6dbfb8ab 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -126,11 +126,13 @@ class LeafConstraints { // only branches containing leaves that are contiguous to the original // leaf need to be updated - for (unsigned int i = 0; i < features.size(); ++i) { - if ((features[i] == inner_feature && is_split_numerical) && - (is_in_right_split[i] == is_right_split)) { - split_contains_new_information = false; - break; + if (is_split_numerical) { + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature && + (is_in_right_split[i] == is_right_split)) { + split_contains_new_information = false; + break; + } } } @@ -191,7 +193,7 @@ class LeafConstraints { } std::pair min_max_constraints; - bool something_changed; + bool something_changed = false; if (use_right_leaf && use_left_leaf) { min_max_constraints = std::minmax(split_info.right_output, split_info.left_output); @@ -251,16 +253,14 @@ class LeafConstraints { learner_state.tree->left_child(node_idx), features, thresholds, is_in_right_split, maximum, split_feature, split_info, use_left_leaf, use_right_leaf_for_update && use_right_leaf, - split_threshold, best_split_per_leaf_, - learner_state); + split_threshold, best_split_per_leaf_, learner_state); } if (keep_going_left_right.second) { GoDownToFindLeavesToUpdate( learner_state.tree->right_child(node_idx), features, thresholds, is_in_right_split, maximum, split_feature, split_info, use_left_leaf_for_update && use_left_leaf, use_right_leaf, - split_threshold, best_split_per_leaf_, - learner_state); + split_threshold, best_split_per_leaf_, learner_state); } } } @@ -271,8 +271,7 @@ class LeafConstraints { std::vector &best_split_per_leaf_, LearnerState &learner_state) { int depth = learner_state.tree->leaf_depth( - ~learner_state.tree->left_child(node_idx)) - - 1; + ~learner_state.tree->left_child(node_idx)) - 1; std::vector features; std::vector thresholds; @@ -294,16 +293,16 @@ class LeafConstraints { const std::vector &is_in_right_split, const Dataset *train_data_) { int inner_feature = tree->split_feature_inner(node_idx); uint32_t threshold = tree->threshold_in_bin(node_idx); - bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; + bool is_split_numerical = + train_data_->FeatureBinMapper(inner_feature)->bin_type() == BinType::NumericalBin; bool keep_going_right = true; bool keep_going_left = true; // left and right nodes are checked to find out if they are contiguous with the original leaf // if so the algorithm should keep going down these nodes to update constraints - for (unsigned int i = 0; i < features.size(); ++i) { - if (features[i] == inner_feature) { - if (is_split_numerical) { + if (is_split_numerical) { + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature) { if (threshold >= thresholds[i] && !is_in_right_split[i]) { keep_going_right = false; } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 14d6af8c8031..b44586473d89 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -652,7 +652,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, constraints_->leaves_to_update.pop_back(); } } else { - throw "monotone_constraints_methode has to be one of ('basic', " + throw "monotone_constraints_method has to be one of ('basic', " "'intermediate')"; // FIXME improve how this exception is handled } } From 0b5a8b18b58b1b99b4aa5ee078af989a9dcec215 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 18 Feb 2020 16:41:05 +0000 Subject: [PATCH 06/47] Typo. --- docs/Parameters.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 0625998f1820..81e3ee06a65f 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -376,7 +376,7 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout -- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method``, constraints: ``0.0 <= monotone_penalty (< max_depth, if max_depth > 0)`` +- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method`` - used only if ``monotone_constraints`` is set From 20294b5e9ced02c37223d8465ca6aae7ac2bf51f Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 18 Feb 2020 17:09:12 +0000 Subject: [PATCH 07/47] Linting. --- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 31b0897bc61e..d660e6f5e713 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1038,9 +1038,9 @@ def is_correctly_constrained(learner): monotonically_decreasing_y = learner.predict(monotonically_decreasing_x) non_monotone_x = np.column_stack((fixed_x, fixed_x, variable_x)) non_monotone_y = learner.predict(non_monotone_x) - if not (is_increasing(monotonically_increasing_y) and - is_decreasing(monotonically_decreasing_y) and - is_non_monotone(non_monotone_y)): + if not (is_increasing(monotonically_increasing_y) + and is_decreasing(monotonically_decreasing_y) + and is_non_monotone(non_monotone_y)): return False return True From 06e36d3f4a81673e2555bea9654b8180fae37be8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 19 Feb 2020 13:45:09 +0000 Subject: [PATCH 08/47] Ran the parameter generator for the doc. --- docs/Parameters.rst | 5 ++--- src/io/config_auto.cpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 81e3ee06a65f..101b76c28e17 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -376,12 +376,11 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout -- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method`` +- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method`` - used only if ``monotone_constraints`` is set - - monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may very slightly slow the library. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. - + - monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may slow the library very slightly. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. - ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 9ec36f3efbae..bdef7a569d2c 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -622,8 +622,8 @@ std::string Config::SaveMembersToString() const { str_buf << "[lambda_l1: " << lambda_l1 << "]\n"; str_buf << "[lambda_l2: " << lambda_l2 << "]\n"; str_buf << "[min_gain_to_split: " << min_gain_to_split << "]\n"; - str_buf << "[monotone_constraints_method: " << monotone_constraints_method << "]\n"; str_buf << "[drop_rate: " << drop_rate << "]\n"; + str_buf << "[monotone_constraints_method: " << monotone_constraints_method << "]\n"; str_buf << "[max_drop: " << max_drop << "]\n"; str_buf << "[skip_drop: " << skip_drop << "]\n"; str_buf << "[xgboost_dart_mode: " << xgboost_dart_mode << "]\n"; From 81f8a914b55d2de4d1f3e471c52f0cd1e9daf6d3 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 20 Feb 2020 08:55:21 +0000 Subject: [PATCH 09/47] Removed usage of the FeatureMonotone function. --- src/treelearner/monotone_constraints.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 43ac6dbfb8ab..c8b23720bcd4 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -115,8 +115,9 @@ class LeafConstraints { int parent_idx = learner_state.tree->node_parent(node_idx); if (parent_idx != -1) { int inner_feature = learner_state.tree->split_feature_inner(parent_idx); + int feature = learner_state.tree->split_feature(parent_idx); int8_t monotone_type = - learner_state.train_data_->FeatureMonotone(inner_feature); + learner_state.config_->monotone_constraints[feature]; bool is_right_split = learner_state.tree->right_child(parent_idx) == node_idx; bool split_contains_new_information = true; From 590cc42783d623fe0ae4a3759a6868b2cccbb607 Mon Sep 17 00:00:00 2001 From: guolinke Date: Fri, 28 Feb 2020 17:08:24 +0800 Subject: [PATCH 10/47] more fixes --- include/LightGBM/tree.h | 95 +++------ src/io/config.cpp | 5 + src/io/tree.cpp | 9 +- src/treelearner/monotone_constraints.hpp | 249 +++++++++++++---------- src/treelearner/serial_tree_learner.cpp | 108 ++++------ src/treelearner/serial_tree_learner.h | 9 +- 6 files changed, 224 insertions(+), 251 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 38390f8b329d..f43a3987531d 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -60,7 +60,7 @@ class Tree { int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, double left_weight, double right_weight, - float gain, MissingType missing_type, bool default_left, bool feature_is_monotone); + float gain, MissingType missing_type, bool default_left); /*! * \brief Performing a split on tree leaves, with categorical feature @@ -134,26 +134,16 @@ class Tree { inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndexByMap(const std::unordered_map& feature_values) const; +<<<<<<< HEAD <<<<<<< HEAD ======= // Get node parent inline int node_parent(int node_idx) const; // Get leaf parent inline int leaf_parent(int node_idx) const; +======= +>>>>>>> more fixes - // Get children - inline int left_child(int node_idx) const; - inline int right_child(int node_idx) const; - - // Get if the feature is in a monotone subtree - inline bool leaf_is_in_monotone_subtree(int leaf_idx) const; - - inline double internal_value(int node_idx) const; - - inline uint32_t threshold_in_bin(int node_idx) const; - - // Get the feature corresponding to the split - inline int split_feature_inner(int node_idx) const; >>>>>>> Add util functions. inline void PredictContrib(const double* feature_values, int num_features, double* output); @@ -169,6 +159,31 @@ class Tree { inline double split_gain(int split_idx) const { return split_gain_[split_idx]; } + inline double Tree::internal_value(int node_idx) const { + return internal_value_[node_idx]; + } + + inline bool IsNumericalSplit(int node_idx) const { + return !GetDecisionType(decision_type_[node_idx], kCategoricalMask); + } + + inline int left_child(int node_idx) const { return left_child_[node_idx]; } + + inline int right_child(int node_idx) const { return right_child_[node_idx]; } + + inline int split_feature_inner(int node_idx) const { + return split_feature_inner_[node_idx]; + } + + inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; } + + inline uint32_t threshold_in_bin(int node_idx) const { +#ifdef DEBUG + CHECK(node_idx >= 0); +#endif + return threshold_in_bin_[node_idx]; + } + /*! \brief Get the number of data points that fall at or below this node*/ inline int data_count(int node) const { return node >= 0 ? internal_count_[node] : leaf_count_[~node]; } @@ -348,7 +363,7 @@ class Tree { } inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain, bool feature_is_monotone); + double left_weight, double right_weight, float gain); /*! * \brief Find leaf index of which record belongs by features * \param feature_values Feature value of this record @@ -447,22 +462,12 @@ class Tree { std::vector leaf_depth_; double shrinkage_; int max_depth_; - // add parent node information - std::vector node_parent_; - // Keeps track of the monotone splits above the leaf - std::vector leaf_is_in_monotone_subtree_; }; inline void Tree::Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain, bool feature_is_monotone) { + double left_weight, double right_weight, float gain) { int new_node_idx = num_leaves_ - 1; - - // Update if there is a monotone split above the leaf - if (feature_is_monotone || leaf_is_in_monotone_subtree_[leaf]) { - leaf_is_in_monotone_subtree_[leaf] = true; - leaf_is_in_monotone_subtree_[num_leaves_] = true; - } // update parent info int parent = leaf_parent_[leaf]; if (parent >= 0) { @@ -476,8 +481,6 @@ inline void Tree::Split(int leaf, int feature, int real_feature, // add new node split_feature_inner_[new_node_idx] = feature; split_feature_[new_node_idx] = real_feature; - node_parent_[new_node_idx] = parent; - split_gain_[new_node_idx] = gain; // add two new leaves left_child_[new_node_idx] = ~leaf; @@ -585,42 +588,6 @@ inline int Tree::GetLeafByMap(const std::unordered_map& feature_val return ~node; } -inline int Tree::node_parent(int node_idx) const{ - return node_parent_[node_idx]; -} - -inline int Tree::left_child(int node_idx) const{ - return left_child_[node_idx]; -} - -inline int Tree::right_child(int node_idx) const{ - return right_child_[node_idx]; -} - -inline int Tree::split_feature_inner(int node_idx) const{ - return split_feature_inner_[node_idx]; -} - -inline int Tree::leaf_parent(int node_idx) const{ - return leaf_parent_[node_idx]; -} - -inline uint32_t Tree::threshold_in_bin(int node_idx) const{ - #ifdef DEBUG - CHECK(node_idx >= 0); - #endif - return threshold_in_bin_[node_idx]; -} - -inline bool Tree::leaf_is_in_monotone_subtree(int leaf_idx) const { - return leaf_is_in_monotone_subtree_[leaf_idx]; -} - -inline double Tree::internal_value(int node_idx) const { - return internal_value_[node_idx]; -} - - } // namespace LightGBM #endif // LightGBM_TREE_H_ diff --git a/src/io/config.cpp b/src/io/config.cpp index 3aeb82b0eaac..fd2856ede94a 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -317,6 +317,11 @@ void Config::CheckParamConflict() { force_col_wise = true; force_row_wise = false; } + if (is_parallel && monotone_constraints_method == std::string("intermediate")) { + // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. + Log::Warning("cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); + monotone_constraints_method == "basic"; + } } std::string Config::ToString() const { diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 42519ffb727a..5b5e24a2321c 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -24,9 +24,7 @@ Tree::Tree(int max_leaves) threshold_.resize(max_leaves_ - 1); decision_type_.resize(max_leaves_ - 1, 0); split_gain_.resize(max_leaves_ - 1); - node_parent_.resize(max_leaves_ - 1); leaf_parent_.resize(max_leaves_); - leaf_is_in_monotone_subtree_.resize(max_leaves_); leaf_value_.resize(max_leaves_); leaf_weight_.resize(max_leaves_); leaf_count_.resize(max_leaves_); @@ -40,7 +38,6 @@ Tree::Tree(int max_leaves) leaf_value_[0] = 0.0f; leaf_weight_[0] = 0.0f; leaf_parent_[0] = -1; - node_parent_.resize(max_leaves_ - 1); shrinkage_ = 1.0f; num_cat_ = 0; cat_boundaries_.push_back(0); @@ -54,8 +51,8 @@ Tree::~Tree() { int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, - MissingType missing_type, bool default_left, bool feature_was_monotone) { - Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain, feature_was_monotone); + MissingType missing_type, bool default_left) { + Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask); @@ -76,7 +73,7 @@ int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, const uint32_t* threshold, int num_threshold, double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) { - Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain, false); + Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask); diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index c8b23720bcd4..3df4a1373253 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -1,32 +1,19 @@ /*! * Copyright (c) 2020 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. */ #ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ #define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ -#include #include #include +#include #include #include "split_info.hpp" namespace LightGBM { -struct CostEfficientGradientBoosting; - -struct LearnerState { - const Config *config_; - const Dataset *train_data_; - const Tree *tree; - std::unique_ptr &cegb_; - - LearnerState(const Config *config_, const Dataset *train_data_, - const Tree *tree, - std::unique_ptr &cegb_) - : config_(config_), train_data_(train_data_), tree(tree), cegb_(cegb_) {}; -}; - struct ConstraintEntry { double min = -std::numeric_limits::max(); double max = std::numeric_limits::max(); @@ -59,23 +46,41 @@ struct ConstraintEntry { } }; -template -class LeafConstraints { +class LeafConstraintsBase { public: - explicit LeafConstraints(int num_leaves) : num_leaves_(num_leaves) { + virtual ~LeafConstraintsBase(){}; + virtual const ConstraintEntry &Get(int leaf_idx) const = 0; + virtual void Reset() = 0; + virtual void BeforeSplit(const Tree *tree, int leaf, int new_leaf, + int8_t monotone_type) = 0; + virtual std::vector Update( + const Tree *tree, bool is_numerical_split, + int leaf, int new_leaf, int8_t monotone_type, double right_output, + double left_output, int split_feature, const SplitInfo &split_info, + const std::vector &best_split_per_leaf) = 0; + + inline static LeafConstraintsBase *Create(const Config *config, int num_leaves); +}; + +class BasicLeafConstraints : public LeafConstraintsBase { + public: + explicit BasicLeafConstraints(int num_leaves) : num_leaves_(num_leaves) { entries_.resize(num_leaves_); - leaves_to_update.reserve(num_leaves_); } - void Reset() { - for (auto& entry : entries_) { + void Reset() override { + for (auto &entry : entries_) { entry.Reset(); } } - void UpdateConstraintsWithMid(bool is_numerical_split, int leaf, int new_leaf, - int8_t monotone_type, double right_output, - double left_output) { + void BeforeSplit(const Tree *, int, int, int8_t) override {} + + std::vector Update(const Tree *, + bool is_numerical_split, int leaf, int new_leaf, + int8_t monotone_type, double right_output, + double left_output, int, const SplitInfo &, + const std::vector &) override { entries_[new_leaf] = entries_[leaf]; if (is_numerical_split) { double mid = (left_output + right_output) / 2.0f; @@ -87,6 +92,39 @@ class LeafConstraints { entries_[new_leaf].UpdateMin(mid); } } + return std::vector(); + } + + const ConstraintEntry &Get(int leaf_idx) const { return entries_[leaf_idx]; } + + private: + int num_leaves_; + std::vector entries_; +}; + +class FastLeafConstraints : public BasicLeafConstraints { + public: + explicit FastLeafConstraints(const Config *config, int num_leaves) + : BasicLeafConstraints(num_leaves), config_(config) { + leaf_is_in_monotone_subtree_.resize(num_leaves_, false); + node_parent_.resize(num_leaves_, 0); + leaves_to_update_.reserve(num_leaves_); + } + + void Reset() override { + BasicLeafConstraints::Reset(); + std::fill_n(leaf_is_in_monotone_subtree_.begin(), num_leaves_, false); + std::fill_n(node_parent_.begin(), num_leaves_, 0); + leaves_to_update_.clear(); + } + + void BeforeSplit(const Tree *tree, int leaf, int new_leaf, + int8_t monotone_type) override { + if (monotone_type != 0 || leaf_is_in_monotone_subtree_[leaf]) { + leaf_is_in_monotone_subtree_[leaf] = true; + leaf_is_in_monotone_subtree_[new_leaf] = true; + } + node_parent_[new_leaf - 1] = tree->leaf_parent(leaf); } void UpdateConstraintsWithOutputs(bool is_numerical_split, int leaf, @@ -104,26 +142,34 @@ class LeafConstraints { } } - void GoUpToFindLeavesToUpdate(int node_idx, std::vector &features, - std::vector &thresholds, - std::vector &is_in_right_split, - int split_feature, const SplitInfo &split_info, - uint32_t split_threshold, - std::vector &best_split_per_leaf_, - LearnerState &learner_state) { + std::vector Update(const Tree *tree, bool is_numerical_split, int leaf, + int new_leaf, int8_t monotone_type, + double right_output, double left_output, + int split_feature, const SplitInfo &split_info, + const std::vector &best_split_per_leaf) { + leaves_to_update_.clear(); + UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, + monotone_type, right_output, left_output); + + GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), split_feature, + split_info, split_info.threshold, + best_split_per_leaf); + return leaves_to_update_; + } - int parent_idx = learner_state.tree->node_parent(node_idx); + void GoUpToFindLeavesToUpdate( + const Tree *tree, int node_idx, std::vector &features, + std::vector &thresholds, std::vector &is_in_right_split, + int split_feature, const SplitInfo &split_info, uint32_t split_threshold, + const std::vector &best_split_per_leaf) { + int parent_idx = node_parent_[node_idx]; if (parent_idx != -1) { - int inner_feature = learner_state.tree->split_feature_inner(parent_idx); - int feature = learner_state.tree->split_feature(parent_idx); - int8_t monotone_type = - learner_state.config_->monotone_constraints[feature]; - bool is_right_split = - learner_state.tree->right_child(parent_idx) == node_idx; + int inner_feature = tree->split_feature_inner(parent_idx); + int feature = tree->split_feature(parent_idx); + int8_t monotone_type = config_->monotone_constraints[feature]; + bool is_right_split = tree->right_child(parent_idx) == node_idx; bool split_contains_new_information = true; - bool is_split_numerical = - learner_state.train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; + bool is_split_numerical = tree->IsNumericalSplit(node_idx); // only branches containing leaves that are contiguous to the original // leaf need to be updated @@ -139,57 +185,46 @@ class LeafConstraints { if (split_contains_new_information) { if (monotone_type != 0) { - int left_child_idx = learner_state.tree->left_child(parent_idx); - int right_child_idx = learner_state.tree->right_child(parent_idx); + int left_child_idx = tree->left_child(parent_idx); + int right_child_idx = tree->right_child(parent_idx); bool left_child_is_curr_idx = (left_child_idx == node_idx); int node_idx_to_pass = (left_child_is_curr_idx) ? right_child_idx : left_child_idx; bool take_min = (monotone_type < 0) ? left_child_is_curr_idx : !left_child_is_curr_idx; - GoDownToFindLeavesToUpdate( - node_idx_to_pass, features, thresholds, is_in_right_split, - take_min, split_feature, split_info, true, true, split_threshold, - best_split_per_leaf_, - learner_state); + GoDownToFindLeavesToUpdate(tree, node_idx_to_pass, features, + thresholds, is_in_right_split, take_min, + split_feature, split_info, true, true, + split_threshold, best_split_per_leaf); } - is_in_right_split.push_back( - learner_state.tree->right_child(parent_idx) == node_idx); - thresholds.push_back(learner_state.tree->threshold_in_bin(parent_idx)); - features.push_back(learner_state.tree->split_feature_inner(parent_idx)); + is_in_right_split.push_back(tree->right_child(parent_idx) == node_idx); + thresholds.push_back(tree->threshold_in_bin(parent_idx)); + features.push_back(tree->split_feature_inner(parent_idx)); } if (parent_idx != 0) { - LeafConstraints::GoUpToFindLeavesToUpdate( - parent_idx, features, thresholds, is_in_right_split, split_feature, - split_info, split_threshold, best_split_per_leaf_, - learner_state); + GoUpToFindLeavesToUpdate(tree, parent_idx, features, thresholds, + is_in_right_split, split_feature, split_info, + split_threshold, best_split_per_leaf); } } } void GoDownToFindLeavesToUpdate( - int node_idx, const std::vector &features, + const Tree *tree, int node_idx, const std::vector &features, const std::vector &thresholds, const std::vector &is_in_right_split, int maximum, int split_feature, const SplitInfo &split_info, bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, - std::vector &best_split_per_leaf_, - LearnerState &learner_state) { - + const std::vector &best_split_per_leaf) { if (node_idx < 0) { int leaf_idx = ~node_idx; - // if a leaf is at max depth then there is no need to update it - int max_depth = learner_state.config_->max_depth; - if (learner_state.tree->leaf_depth(leaf_idx) >= max_depth && - max_depth > 0) { - return; - } - - // splits that are not to be used shall not be updated - if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { + // splits that are not to be used shall not be updated, included leaf at + // max depth + if (best_split_per_leaf[leaf_idx].gain == kMinScore) { return; } @@ -226,18 +261,15 @@ class LeafConstraints { if (!something_changed) { return; } - leaves_to_update.push_back(leaf_idx); + leaves_to_update_.push_back(leaf_idx); } else { // check if the children are contiguous with the original leaf std::pair keep_going_left_right = ShouldKeepGoingLeftRight( - learner_state.tree, node_idx, features, thresholds, is_in_right_split, - learner_state.train_data_); - int inner_feature = learner_state.tree->split_feature_inner(node_idx); - uint32_t threshold = learner_state.tree->threshold_in_bin(node_idx); - bool is_split_numerical = - learner_state.train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; + tree, node_idx, features, thresholds, is_in_right_split); + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = tree->IsNumericalSplit(node_idx); bool use_left_leaf_for_update = true; bool use_right_leaf_for_update = true; if (is_split_numerical && inner_feature == split_feature) { @@ -250,29 +282,27 @@ class LeafConstraints { } if (keep_going_left_right.first) { - GoDownToFindLeavesToUpdate( - learner_state.tree->left_child(node_idx), features, thresholds, - is_in_right_split, maximum, split_feature, split_info, - use_left_leaf, use_right_leaf_for_update && use_right_leaf, - split_threshold, best_split_per_leaf_, learner_state); + GoDownToFindLeavesToUpdate(tree, tree->left_child(node_idx), features, + thresholds, is_in_right_split, maximum, + split_feature, split_info, use_left_leaf, + use_right_leaf_for_update && use_right_leaf, + split_threshold, best_split_per_leaf); } if (keep_going_left_right.second) { GoDownToFindLeavesToUpdate( - learner_state.tree->right_child(node_idx), features, thresholds, + tree, tree->right_child(node_idx), features, thresholds, is_in_right_split, maximum, split_feature, split_info, use_left_leaf_for_update && use_left_leaf, use_right_leaf, - split_threshold, best_split_per_leaf_, learner_state); + split_threshold, best_split_per_leaf); } } } - void GoUpToFindLeavesToUpdate(int node_idx, int split_feature, - const SplitInfo &split_info, - uint32_t split_threshold, - std::vector &best_split_per_leaf_, - LearnerState &learner_state) { - int depth = learner_state.tree->leaf_depth( - ~learner_state.tree->left_child(node_idx)) - 1; + void GoUpToFindLeavesToUpdate( + const Tree *tree, int node_idx, int split_feature, + const SplitInfo &split_info, uint32_t split_threshold, + const std::vector &best_split_per_leaf) { + int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; std::vector features; std::vector thresholds; @@ -282,25 +312,24 @@ class LeafConstraints { thresholds.reserve(depth); is_in_right_split.reserve(depth); - GoUpToFindLeavesToUpdate( - node_idx, features, thresholds, is_in_right_split, split_feature, - split_info, split_threshold, best_split_per_leaf_, - learner_state); + GoUpToFindLeavesToUpdate(tree, node_idx, features, thresholds, + is_in_right_split, split_feature, split_info, + split_threshold, best_split_per_leaf); } std::pair ShouldKeepGoingLeftRight( const Tree *tree, int node_idx, const std::vector &features, const std::vector &thresholds, - const std::vector &is_in_right_split, const Dataset *train_data_) { + const std::vector &is_in_right_split) { int inner_feature = tree->split_feature_inner(node_idx); uint32_t threshold = tree->threshold_in_bin(node_idx); - bool is_split_numerical = - train_data_->FeatureBinMapper(inner_feature)->bin_type() == BinType::NumericalBin; + bool is_split_numerical = tree->IsNumericalSplit(node_idx); bool keep_going_right = true; bool keep_going_left = true; - // left and right nodes are checked to find out if they are contiguous with the original leaf - // if so the algorithm should keep going down these nodes to update constraints + // left and right nodes are checked to find out if they are contiguous with + // the original leaf if so the algorithm should keep going down these nodes + // to update constraints if (is_split_numerical) { for (unsigned int i = 0; i < features.size(); ++i) { if (features[i] == inner_feature) { @@ -316,14 +345,26 @@ class LeafConstraints { return std::pair(keep_going_left, keep_going_right); } - const ConstraintEntry& Get(int leaf_idx) const { return entries_[leaf_idx]; } - - std::vector leaves_to_update; + const ConstraintEntry &Get(int leaf_idx) const { return entries_[leaf_idx]; } private: + const Config *config_; int num_leaves_; std::vector entries_; + std::vector leaves_to_update_; + // add parent node information + std::vector node_parent_; + // Keeps track of the monotone splits above the leaf + std::vector leaf_is_in_monotone_subtree_; }; +LeafConstraintsBase *LeafConstraintsBase::Create(const Config *config, + int num_leaves) { + if (config->monotone_constraints_method == "intermediate") { + return new FastLeafConstraints(config, num_leaves); + } + return new BasicLeafConstraints(num_leaves); +} + } // namespace LightGBM #endif // LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b44586473d89..d35975e76459 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -46,7 +46,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); - constraints_.reset(new LeafConstraints(config_->num_leaves)); + constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); // initialize splits for leaf smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data())); @@ -144,6 +144,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) { cegb_.reset(new CostEfficientGradientBoosting(this)); cegb_->Init(); } + constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) { @@ -533,6 +534,10 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, *left_leaf = best_leaf; auto next_leaf_id = tree->NextLeafId(); + // update before tree split + constraints_->BeforeSplit(tree, best_leaf, next_leaf_id, + best_split_info.monotone_type); + bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; @@ -618,47 +623,18 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); - - // constraints are updated differently depending on the method used - if (!config_->monotone_constraints.empty() && - tree->leaf_is_in_monotone_subtree(*right_leaf)) { - if (config_->monotone_constraints_method == "basic") { - constraints_->UpdateConstraintsWithMid( - is_numerical_split, *left_leaf, *right_leaf, - best_split_info.monotone_type, best_split_info.right_output, - best_split_info.left_output); - - } else if (config_->monotone_constraints_method == "intermediate") { - constraints_->UpdateConstraintsWithOutputs( - is_numerical_split, *left_leaf, *right_leaf, - best_split_info.monotone_type, best_split_info.right_output, - best_split_info.left_output); - - LearnerState learner_state(config_, train_data_, tree, cegb_); - - // if there is a monotone split above, new values should - // be checked not to clash with existing constraints in the subtree; - // if they do, the existing splits need to be updated - constraints_->GoUpToFindLeavesToUpdate( - tree->leaf_parent(*right_leaf), inner_feature_index, best_split_info, - best_split_info.threshold, best_split_per_leaf_, learner_state); - - // Update splits that just got their constraints updated - while (!constraints_->leaves_to_update.empty()) { - int leaf_idx = constraints_->leaves_to_update.back(); - UpdateBestSplitsFromHistograms( - &best_split_per_leaf_[leaf_idx], leaf_idx, is_feature_used_, - num_features_, histogram_pool_, learner_state); - constraints_->leaves_to_update.pop_back(); - } - } else { - throw "monotone_constraints_method has to be one of ('basic', " - "'intermediate')"; // FIXME improve how this exception is handled - } + } + auto leaves_need_update = constraints_->Update( + tree, is_numerical_split, *left_leaf, *right_leaf, + best_split_info.monotone_type, best_split_info.right_output, + best_split_info.left_output, inner_feature_index, best_split_info, + best_split_per_leaf_); + // update leave outputs if needed + for (auto leaf: leaves_need_update) { + RecomputeBestSplitForLeaf(leaf, &best_split_per_leaf_[leaf]); } } ->>>>>>> Add the intermediate constraining method. void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { if (obj != nullptr && obj->IsRenewTreeOutput()) { @@ -721,51 +697,43 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } } - -// this function updates the best split for a leaf -// it is called only when monotone constraints exist -void SerialTreeLearner::UpdateBestSplitsFromHistograms( - SplitInfo *split, int leaf, - const std::vector &is_feature_used_, - int num_features_, HistogramPool &histogram_pool_, - LearnerState &learner_state) { - // the feature histogram is retrieved - FeatureHistogram *histogram_array_; - histogram_pool_.Get(leaf, &histogram_array_); +void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { + FeatureHistogram* histogram_array_; + if (!histogram_pool_.Get(leaf, &histogram_array_)) { + Log::Warning( + "Get historical Histogram for leaf %d failed, will skip the " + "``RecomputeBestSplitForLeaf``", + leaf); + return; + } double sum_gradients = split->left_sum_gradient + split->right_sum_gradient; double sum_hessians = split->left_sum_hessian + split->right_sum_hessian; int num_data = split->left_count + split->right_count; - split->gain = kMinScore; + + std::vector bests(num_threads_); + LeafSplits leaf_splits(num_data); + leaf_splits.Init(leaf, sum_gradients, sum_hessians); OMP_INIT_EX(); -#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) +// find splits +#pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); - // splits are computed for the feature that are to be used - if (!histogram_array_[feature_index].is_splittable()) { + if (!is_feature_used_[feature_index]) { continue; } - - // loop through the features to find the best one just like in the - // FindBestSplitsFromHistograms function - int real_fidx = learner_state.train_data_->RealFeatureIndex(feature_index); - LeafSplits *leaf_splits = new LeafSplits(0); - leaf_splits->Init(leaf, sum_gradients, - sum_hessians); - - // best split is updated + const int tid = omp_get_thread_num(); + int real_fidx = train_data_->RealFeatureIndex(feature_index); ComputeBestSplitForFeature( - histogram_array_, - feature_index, - real_fidx, - is_feature_used_[feature_index], // FIXME is this the right parameter to pass here? - num_data, - leaf_splits, - split); + smaller_leaf_histogram_array_, feature_index, real_fidx, + true, // fixme: this cannot work with colsample_bynode, but do we really need to compute all features? why not just the used features of `split->feature`? + num_data, &leaf_splits, &bests[tid]); OMP_LOOP_EX_END(); } OMP_THROW_EX(); + auto best_idx = ArrayArgs::ArgMax(bests); + *split = bests[best_idx]; } } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 23d8246ec913..6a0d7f0e9a6d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -122,12 +122,7 @@ class SerialTreeLearner: public TreeLearner { void GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time); - - void UpdateBestSplitsFromHistograms( - SplitInfo *split, int leaf, const std::vector &is_feature_used_, - int num_features_, HistogramPool &histogram_pool_, - LearnerState &learner_state); - + void RecomputeBestSplitForLeaf(int leaf, SplitInfo* split); /*! * \brief Some initial works before training @@ -195,7 +190,7 @@ class SerialTreeLearner: public TreeLearner { /*! \brief store best split per feature for all leaves */ std::vector splits_per_leaf_; /*! \brief stores minimum and maximum constraints for each leaf */ - std::unique_ptr> constraints_; + std::unique_ptr constraints_; /*! \brief stores best thresholds for all feature for smaller leaf */ std::unique_ptr smaller_leaf_splits_; From 6cb5b3c9c63323adb08216d161581ed2faf342f4 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 2 Mar 2020 08:59:14 +0000 Subject: [PATCH 11/47] Fix. --- include/LightGBM/tree.h | 2 +- src/treelearner/monotone_constraints.hpp | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f43a3987531d..f422cbce0d47 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -159,7 +159,7 @@ class Tree { inline double split_gain(int split_idx) const { return split_gain_[split_idx]; } - inline double Tree::internal_value(int node_idx) const { + inline double internal_value(int node_idx) const { return internal_value_[node_idx]; } diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 3df4a1373253..e4db64219789 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -97,7 +97,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { const ConstraintEntry &Get(int leaf_idx) const { return entries_[leaf_idx]; } - private: + protected: int num_leaves_; std::vector entries_; }; @@ -107,14 +107,14 @@ class FastLeafConstraints : public BasicLeafConstraints { explicit FastLeafConstraints(const Config *config, int num_leaves) : BasicLeafConstraints(num_leaves), config_(config) { leaf_is_in_monotone_subtree_.resize(num_leaves_, false); - node_parent_.resize(num_leaves_, 0); + node_parent_.resize(num_leaves_ - 1, -1); leaves_to_update_.reserve(num_leaves_); } void Reset() override { BasicLeafConstraints::Reset(); std::fill_n(leaf_is_in_monotone_subtree_.begin(), num_leaves_, false); - std::fill_n(node_parent_.begin(), num_leaves_, 0); + std::fill_n(node_parent_.begin(), num_leaves_ - 1, -1); leaves_to_update_.clear(); } @@ -349,8 +349,6 @@ class FastLeafConstraints : public BasicLeafConstraints { private: const Config *config_; - int num_leaves_; - std::vector entries_; std::vector leaves_to_update_; // add parent node information std::vector node_parent_; From 42c79f00507001fe5d0561402649989969d55c6b Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 08:30:29 +0000 Subject: [PATCH 12/47] Remove duplicated code. --- src/treelearner/monotone_constraints.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index e4db64219789..dd83065d69da 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -345,8 +345,6 @@ class FastLeafConstraints : public BasicLeafConstraints { return std::pair(keep_going_left, keep_going_right); } - const ConstraintEntry &Get(int leaf_idx) const { return entries_[leaf_idx]; } - private: const Config *config_; std::vector leaves_to_update_; From d4fdc4341f0b9612c7fd9fd3511cd3a9a1aa5b25 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 08:33:54 +0000 Subject: [PATCH 13/47] Add debug checks. --- src/treelearner/monotone_constraints.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index dd83065d69da..1e2fad5b2da7 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -124,6 +124,10 @@ class FastLeafConstraints : public BasicLeafConstraints { leaf_is_in_monotone_subtree_[leaf] = true; leaf_is_in_monotone_subtree_[new_leaf] = true; } +#ifdef DEBUG + CHECK(new_leaf - 1 >= 0); + CHECK(new_leaf - 1 < node_parent_.size()); +#endif node_parent_[new_leaf - 1] = tree->leaf_parent(leaf); } @@ -162,6 +166,10 @@ class FastLeafConstraints : public BasicLeafConstraints { std::vector &thresholds, std::vector &is_in_right_split, int split_feature, const SplitInfo &split_info, uint32_t split_threshold, const std::vector &best_split_per_leaf) { +#ifdef DEBUG + CHECK(node_idx >= 0); + CHECK(node_idx < node_parent_.size()); +#endif int parent_idx = node_parent_[node_idx]; if (parent_idx != -1) { int inner_feature = tree->split_feature_inner(parent_idx); From d26af9b6bd622d41e2310df1990a1a352518718b Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 09:45:29 +0000 Subject: [PATCH 14/47] Typo. --- src/treelearner/serial_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index d35975e76459..f8f7dd57c331 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -725,7 +725,7 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { const int tid = omp_get_thread_num(); int real_fidx = train_data_->RealFeatureIndex(feature_index); ComputeBestSplitForFeature( - smaller_leaf_histogram_array_, feature_index, real_fidx, + histogram_array_, feature_index, real_fidx, true, // fixme: this cannot work with colsample_bynode, but do we really need to compute all features? why not just the used features of `split->feature`? num_data, &leaf_splits, &bests[tid]); From 2a437ed4443cd889d1957e4571988a1e217eefa9 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 10:08:19 +0000 Subject: [PATCH 15/47] Bug fix. --- src/io/config.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/config.cpp b/src/io/config.cpp index fd2856ede94a..90bddb90914c 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -320,7 +320,7 @@ void Config::CheckParamConflict() { if (is_parallel && monotone_constraints_method == std::string("intermediate")) { // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. Log::Warning("cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); - monotone_constraints_method == "basic"; + monotone_constraints_method = "basic"; } } From b68e70d93ddd3d0a7fca5bb4335742745ddeeb22 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 10:08:39 +0000 Subject: [PATCH 16/47] Disable the use of intermediate monotone constraints and feature sampling at the same time. --- src/io/config.cpp | 6 ++++++ src/treelearner/serial_tree_learner.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/io/config.cpp b/src/io/config.cpp index 90bddb90914c..051e3ba0006a 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -322,6 +322,12 @@ void Config::CheckParamConflict() { Log::Warning("cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); monotone_constraints_method = "basic"; } + if (feature_fraction_bynode != 1.0 && monotone_constraints_method == std::string("intermediate")) { + // "intermediate" monotone constraints need to recompute splits. If the features are sampled when computing the + // split initially, then the sampling needs to be recorded or done once again, which is currently not supported + Log::Warning("cannot use \"intermediate\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method."); + monotone_constraints_method = "basic"; + } } std::string Config::ToString() const { diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index f8f7dd57c331..5d2a5dd51dd8 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -726,7 +726,7 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { int real_fidx = train_data_->RealFeatureIndex(feature_index); ComputeBestSplitForFeature( histogram_array_, feature_index, real_fidx, - true, // fixme: this cannot work with colsample_bynode, but do we really need to compute all features? why not just the used features of `split->feature`? + true, num_data, &leaf_splits, &bests[tid]); OMP_LOOP_EX_END(); From 3757500391fd694f20f1b79d7b1f95b897a70224 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 10:20:04 +0000 Subject: [PATCH 17/47] Added an alias for monotone constraining method. --- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- src/io/config_auto.cpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 101b76c28e17..08fd07b0781a 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -376,7 +376,7 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout -- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method`` +- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method``, ``mc_method`` - used only if ``monotone_constraints`` is set diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index b2cab634e6a7..da563581bf03 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -364,7 +364,7 @@ struct Config { // desc = dropout rate: a fraction of previous trees to drop during the dropout double drop_rate = 0.1; - // alias = monotone_constraining_method + // alias = monotone_constraining_method, mc_method // desc = used only if ``monotone_constraints`` is set // desc = monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may slow the library very slightly. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. std::string monotone_constraints_method = "basic"; diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index bdef7a569d2c..ece6c7ba0bff 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -83,6 +83,7 @@ const std::unordered_map& Config::alias_table() { {"min_split_gain", "min_gain_to_split"}, {"rate_drop", "drop_rate"}, {"monotone_constraining_method", "monotone_constraints_method"}, + {"mc_method", "monotone_constraints_method"}, {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, From ecc04a1bf71e0866e796395dce2529a7907da7a3 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 10:31:49 +0000 Subject: [PATCH 18/47] Use the right variable to get the number of threads. --- src/treelearner/serial_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5d2a5dd51dd8..84f87d8a63de 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -710,7 +710,7 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { double sum_hessians = split->left_sum_hessian + split->right_sum_hessian; int num_data = split->left_count + split->right_count; - std::vector bests(num_threads_); + std::vector bests(share_state_->num_threads); LeafSplits leaf_splits(num_data); leaf_splits.Init(leaf, sum_gradients, sum_hessians); From dd012fc124e47d7c87bcf5d794d073be6be27c7a Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 10:54:26 +0000 Subject: [PATCH 19/47] Fix DEBUG checks. --- src/treelearner/monotone_constraints.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 1e2fad5b2da7..62da81389748 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -126,7 +126,7 @@ class FastLeafConstraints : public BasicLeafConstraints { } #ifdef DEBUG CHECK(new_leaf - 1 >= 0); - CHECK(new_leaf - 1 < node_parent_.size()); + CHECK((unsigned int)(new_leaf - 1) < node_parent_.size()); #endif node_parent_[new_leaf - 1] = tree->leaf_parent(leaf); } @@ -168,7 +168,7 @@ class FastLeafConstraints : public BasicLeafConstraints { const std::vector &best_split_per_leaf) { #ifdef DEBUG CHECK(node_idx >= 0); - CHECK(node_idx < node_parent_.size()); + CHECK((unsigned int) node_idx < node_parent_.size()); #endif int parent_idx = node_parent_[node_idx]; if (parent_idx != -1) { @@ -252,10 +252,10 @@ class FastLeafConstraints : public BasicLeafConstraints { #ifdef DEBUG if (maximum) { CHECK(min_max_constraints.first >= - learner_state.tree->LeafOutput(leaf_idx)); + tree->LeafOutput(leaf_idx)); } else { CHECK(min_max_constraints.second <= - learner_state.tree->LeafOutput(leaf_idx)); + tree->LeafOutput(leaf_idx)); } #endif From 57fe30a6f28550a1e367d9659403cc4a51daf140 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 11:12:56 +0000 Subject: [PATCH 20/47] Add back check to determine if histogram is splittable. --- src/treelearner/serial_tree_learner.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 84f87d8a63de..4d619003ea28 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -722,6 +722,9 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { if (!is_feature_used_[feature_index]) { continue; } + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } const int tid = omp_get_thread_num(); int real_fidx = train_data_->RealFeatureIndex(feature_index); ComputeBestSplitForFeature( From 421a7a0456c1398d91e94ffe39ae75e78607a84a Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 11:40:37 +0000 Subject: [PATCH 21/47] Added forgotten override keywords. --- src/treelearner/monotone_constraints.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 62da81389748..52058dec1e97 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -95,7 +95,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { return std::vector(); } - const ConstraintEntry &Get(int leaf_idx) const { return entries_[leaf_idx]; } + const ConstraintEntry &Get(int leaf_idx) const override { return entries_[leaf_idx]; } protected: int num_leaves_; @@ -150,7 +150,7 @@ class FastLeafConstraints : public BasicLeafConstraints { int new_leaf, int8_t monotone_type, double right_output, double left_output, int split_feature, const SplitInfo &split_info, - const std::vector &best_split_per_leaf) { + const std::vector &best_split_per_leaf) override{ leaves_to_update_.clear(); UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, monotone_type, right_output, left_output); From eb6872091e7e4e1e757f4ae13153c3edad5372c7 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 12:32:04 +0000 Subject: [PATCH 22/47] Perform monotone constraint update only when necessary. --- src/treelearner/monotone_constraints.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 52058dec1e97..e6902be74a67 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -152,12 +152,14 @@ class FastLeafConstraints : public BasicLeafConstraints { int split_feature, const SplitInfo &split_info, const std::vector &best_split_per_leaf) override{ leaves_to_update_.clear(); - UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, - monotone_type, right_output, left_output); + if (leaf_is_in_monotone_subtree_[leaf]) { + UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, + monotone_type, right_output, left_output); - GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), split_feature, - split_info, split_info.threshold, - best_split_per_leaf); + GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), split_feature, + split_info, split_info.threshold, + best_split_per_leaf); + } return leaves_to_update_; } From a530020f0ed9792e30cb12c26479f83f58b08b4e Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 3 Mar 2020 13:19:55 +0000 Subject: [PATCH 23/47] Small refactor of FastLeafConstraints. --- src/treelearner/monotone_constraints.hpp | 201 +++++++++++++++-------- 1 file changed, 131 insertions(+), 70 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index e6902be74a67..af96aff03537 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -163,71 +163,115 @@ class FastLeafConstraints : public BasicLeafConstraints { return leaves_to_update_; } + bool OppositeChildShouldBeUpdated( + bool is_split_numerical, + const std::vector &features_of_splits_going_up_from_original_leaf, + int inner_feature, + std::vector &was_original_leaf_right_child_of_split, + bool is_in_right_child) { + bool opposite_child_should_be_updated = true; + + // if the split is categorical, it is not handled by this optimisation, + // so the code will have to go down in the other child subtree to see if + // there are leaves to update + // even though it may sometimes be unnecessary + if (is_split_numerical) { + // only branches containing leaves that are contiguous to the original + // leaf need to be updated + // therefore, for the same feature, there is no use going down from the + // second time going up on the right (or on the left) + for (unsigned int split_idx = 0; + split_idx < features_of_splits_going_up_from_original_leaf.size(); + ++split_idx) { + if (features_of_splits_going_up_from_original_leaf[split_idx] == + inner_feature && + (was_original_leaf_right_child_of_split[split_idx] == + is_in_right_child)) { + opposite_child_should_be_updated = false; + break; + } + } + } + return opposite_child_should_be_updated; + } + + // Recursive function that goes up the tree, and then down to find leaves that + // have constraints to be updated void GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, std::vector &features, - std::vector &thresholds, std::vector &is_in_right_split, + const Tree *tree, int node_idx, + std::vector &features_of_splits_going_up_from_original_leaf, + std::vector &thresholds_of_splits_going_up_from_original_leaf, + std::vector &was_original_leaf_right_child_of_split, int split_feature, const SplitInfo &split_info, uint32_t split_threshold, const std::vector &best_split_per_leaf) { #ifdef DEBUG CHECK(node_idx >= 0); - CHECK((unsigned int) node_idx < node_parent_.size()); + CHECK((unsigned int)node_idx < node_parent_.size()); #endif int parent_idx = node_parent_[node_idx]; if (parent_idx != -1) { int inner_feature = tree->split_feature_inner(parent_idx); int feature = tree->split_feature(parent_idx); int8_t monotone_type = config_->monotone_constraints[feature]; - bool is_right_split = tree->right_child(parent_idx) == node_idx; - bool split_contains_new_information = true; + bool is_in_right_child = tree->right_child(parent_idx) == node_idx; bool is_split_numerical = tree->IsNumericalSplit(node_idx); - // only branches containing leaves that are contiguous to the original - // leaf need to be updated - if (is_split_numerical) { - for (unsigned int i = 0; i < features.size(); ++i) { - if (features[i] == inner_feature && - (is_in_right_split[i] == is_right_split)) { - split_contains_new_information = false; - break; - } - } - } + // this is just an optimisation not to waste time going down in subtrees + // where there won't be any leaf to update + bool opposite_child_should_be_updated = OppositeChildShouldBeUpdated( + is_split_numerical, features_of_splits_going_up_from_original_leaf, + inner_feature, was_original_leaf_right_child_of_split, + is_in_right_child); - if (split_contains_new_information) { + if (opposite_child_should_be_updated) { if (monotone_type != 0) { + // these variables correspond to the current split we encounter going + // up the tree int left_child_idx = tree->left_child(parent_idx); int right_child_idx = tree->right_child(parent_idx); bool left_child_is_curr_idx = (left_child_idx == node_idx); - int node_idx_to_pass = + int opposite_child_idx = (left_child_is_curr_idx) ? right_child_idx : left_child_idx; - bool take_min = (monotone_type < 0) ? left_child_is_curr_idx - : !left_child_is_curr_idx; - - GoDownToFindLeavesToUpdate(tree, node_idx_to_pass, features, - thresholds, is_in_right_split, take_min, - split_feature, split_info, true, true, - split_threshold, best_split_per_leaf); + bool update_max_constraints_in_opposite_child_leaves = + (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + GoDownToFindLeavesToUpdate( + tree, opposite_child_idx, + features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, + update_max_constraints_in_opposite_child_leaves, split_feature, + split_info, true, true, split_threshold, best_split_per_leaf); } - is_in_right_split.push_back(tree->right_child(parent_idx) == node_idx); - thresholds.push_back(tree->threshold_in_bin(parent_idx)); - features.push_back(tree->split_feature_inner(parent_idx)); + was_original_leaf_right_child_of_split.push_back( + tree->right_child(parent_idx) == node_idx); + thresholds_of_splits_going_up_from_original_leaf.push_back( + tree->threshold_in_bin(parent_idx)); + features_of_splits_going_up_from_original_leaf.push_back( + tree->split_feature_inner(parent_idx)); } if (parent_idx != 0) { - GoUpToFindLeavesToUpdate(tree, parent_idx, features, thresholds, - is_in_right_split, split_feature, split_info, - split_threshold, best_split_per_leaf); + GoUpToFindLeavesToUpdate( + tree, parent_idx, features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, split_feature, split_info, + split_threshold, best_split_per_leaf); } } } void GoDownToFindLeavesToUpdate( - const Tree *tree, int node_idx, const std::vector &features, - const std::vector &thresholds, - const std::vector &is_in_right_split, int maximum, - int split_feature, const SplitInfo &split_info, bool use_left_leaf, - bool use_right_leaf, uint32_t split_threshold, + const Tree *tree, int node_idx, + const std::vector &features_of_splits_going_up_from_original_leaf, + const std::vector & + thresholds_of_splits_going_up_from_original_leaf, + const std::vector &was_original_leaf_right_child_of_split, + bool update_max_constraints, int split_feature, + const SplitInfo &split_info, bool use_left_leaf, bool use_right_leaf, + uint32_t split_threshold, const std::vector &best_split_per_leaf) { if (node_idx < 0) { int leaf_idx = ~node_idx; @@ -252,16 +296,14 @@ class FastLeafConstraints : public BasicLeafConstraints { } #ifdef DEBUG - if (maximum) { - CHECK(min_max_constraints.first >= - tree->LeafOutput(leaf_idx)); + if (update_max_constraints) { + CHECK(min_max_constraints.first >= tree->LeafOutput(leaf_idx)); } else { - CHECK(min_max_constraints.second <= - tree->LeafOutput(leaf_idx)); + CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); } #endif - if (!maximum) { + if (!update_max_constraints) { something_changed = entries_[leaf_idx].UpdateMinAndReturnBoolIfChanged( min_max_constraints.second); } else { @@ -276,7 +318,9 @@ class FastLeafConstraints : public BasicLeafConstraints { } else { // check if the children are contiguous with the original leaf std::pair keep_going_left_right = ShouldKeepGoingLeftRight( - tree, node_idx, features, thresholds, is_in_right_split); + tree, node_idx, features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split); int inner_feature = tree->split_feature_inner(node_idx); uint32_t threshold = tree->threshold_in_bin(node_idx); bool is_split_numerical = tree->IsNumericalSplit(node_idx); @@ -292,45 +336,56 @@ class FastLeafConstraints : public BasicLeafConstraints { } if (keep_going_left_right.first) { - GoDownToFindLeavesToUpdate(tree, tree->left_child(node_idx), features, - thresholds, is_in_right_split, maximum, - split_feature, split_info, use_left_leaf, - use_right_leaf_for_update && use_right_leaf, - split_threshold, best_split_per_leaf); + GoDownToFindLeavesToUpdate( + tree, tree->left_child(node_idx), + features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, update_max_constraints, + split_feature, split_info, use_left_leaf, + use_right_leaf_for_update && use_right_leaf, split_threshold, + best_split_per_leaf); } if (keep_going_left_right.second) { GoDownToFindLeavesToUpdate( - tree, tree->right_child(node_idx), features, thresholds, - is_in_right_split, maximum, split_feature, split_info, + tree, tree->right_child(node_idx), + features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, update_max_constraints, + split_feature, split_info, use_left_leaf_for_update && use_left_leaf, use_right_leaf, split_threshold, best_split_per_leaf); } } } - void GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, int split_feature, - const SplitInfo &split_info, uint32_t split_threshold, - const std::vector &best_split_per_leaf) { + void + GoUpToFindLeavesToUpdate(const Tree *tree, int node_idx, int split_feature, + const SplitInfo &split_info, + uint32_t split_threshold, + const std::vector &best_split_per_leaf) { int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; - std::vector features; - std::vector thresholds; - std::vector is_in_right_split; + std::vector features_of_splits_going_up_from_original_leaf; + std::vector thresholds_of_splits_going_up_from_original_leaf; + std::vector was_original_leaf_right_child_of_split; - features.reserve(depth); - thresholds.reserve(depth); - is_in_right_split.reserve(depth); + features_of_splits_going_up_from_original_leaf.reserve(depth); + thresholds_of_splits_going_up_from_original_leaf.reserve(depth); + was_original_leaf_right_child_of_split.reserve(depth); - GoUpToFindLeavesToUpdate(tree, node_idx, features, thresholds, - is_in_right_split, split_feature, split_info, - split_threshold, best_split_per_leaf); + GoUpToFindLeavesToUpdate( + tree, node_idx, features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, split_feature, split_info, + split_threshold, best_split_per_leaf); } std::pair ShouldKeepGoingLeftRight( - const Tree *tree, int node_idx, const std::vector &features, - const std::vector &thresholds, - const std::vector &is_in_right_split) { + const Tree *tree, int node_idx, + const std::vector &features_of_splits_going_up_from_original_leaf, + const std::vector & + thresholds_of_splits_going_up_from_original_leaf, + const std::vector &was_original_leaf_right_child_of_split) { int inner_feature = tree->split_feature_inner(node_idx); uint32_t threshold = tree->threshold_in_bin(node_idx); bool is_split_numerical = tree->IsNumericalSplit(node_idx); @@ -341,12 +396,18 @@ class FastLeafConstraints : public BasicLeafConstraints { // the original leaf if so the algorithm should keep going down these nodes // to update constraints if (is_split_numerical) { - for (unsigned int i = 0; i < features.size(); ++i) { - if (features[i] == inner_feature) { - if (threshold >= thresholds[i] && !is_in_right_split[i]) { + for (unsigned int i = 0; + i < features_of_splits_going_up_from_original_leaf.size(); ++i) { + if (features_of_splits_going_up_from_original_leaf[i] == + inner_feature) { + if (threshold >= + thresholds_of_splits_going_up_from_original_leaf[i] && + !was_original_leaf_right_child_of_split[i]) { keep_going_right = false; } - if (threshold <= thresholds[i] && is_in_right_split[i]) { + if (threshold <= + thresholds_of_splits_going_up_from_original_leaf[i] && + was_original_leaf_right_child_of_split[i]) { keep_going_left = false; } } From 448718399f76b85634d7470a7a03e6c3657e9c23 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 5 Mar 2020 09:23:00 +0000 Subject: [PATCH 24/47] Post rebase commit. --- include/LightGBM/tree.h | 12 ------------ src/treelearner/serial_tree_learner.cpp | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f422cbce0d47..4d64d3b63681 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -134,18 +134,6 @@ class Tree { inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndexByMap(const std::unordered_map& feature_values) const; -<<<<<<< HEAD -<<<<<<< HEAD -======= - // Get node parent - inline int node_parent(int node_idx) const; - // Get leaf parent - inline int leaf_parent(int node_idx) const; -======= ->>>>>>> more fixes - - ->>>>>>> Add util functions. inline void PredictContrib(const double* feature_values, int num_features, double* output); /*! \brief Get Number of leaves*/ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 4d619003ea28..5ce31ed811d2 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -719,7 +719,7 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); - if (!is_feature_used_[feature_index]) { + if (!col_sampler_.is_feature_used_bytree()[feature_index]) { continue; } if (!histogram_array_[feature_index].is_splittable()) { From d1c73c3cf65b661f5c11092d61a0b2389e07396e Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 9 Mar 2020 08:18:23 +0000 Subject: [PATCH 25/47] Small refactor. --- include/LightGBM/tree.h | 3 - src/treelearner/monotone_constraints.hpp | 80 ++++++++++++------------ src/treelearner/serial_tree_learner.cpp | 8 +-- 3 files changed, 43 insertions(+), 48 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 4d64d3b63681..92840e7800ec 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -166,9 +166,6 @@ class Tree { inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; } inline uint32_t threshold_in_bin(int node_idx) const { -#ifdef DEBUG - CHECK(node_idx >= 0); -#endif return threshold_in_bin_[node_idx]; } diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index af96aff03537..69922a6a16c2 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -49,17 +49,17 @@ struct ConstraintEntry { class LeafConstraintsBase { public: virtual ~LeafConstraintsBase(){}; - virtual const ConstraintEntry &Get(int leaf_idx) const = 0; + virtual const ConstraintEntry& Get(int leaf_idx) const = 0; virtual void Reset() = 0; - virtual void BeforeSplit(const Tree *tree, int leaf, int new_leaf, + virtual void BeforeSplit(const Tree* tree, int leaf, int new_leaf, int8_t monotone_type) = 0; virtual std::vector Update( - const Tree *tree, bool is_numerical_split, + const Tree* tree, bool is_numerical_split, int leaf, int new_leaf, int8_t monotone_type, double right_output, - double left_output, int split_feature, const SplitInfo &split_info, - const std::vector &best_split_per_leaf) = 0; + double left_output, int split_feature, const SplitInfo& split_info, + const std::vector& best_split_per_leaf) = 0; - inline static LeafConstraintsBase *Create(const Config *config, int num_leaves); + inline static LeafConstraintsBase* Create(const Config* config, int num_leaves); }; class BasicLeafConstraints : public LeafConstraintsBase { @@ -69,18 +69,18 @@ class BasicLeafConstraints : public LeafConstraintsBase { } void Reset() override { - for (auto &entry : entries_) { + for (auto& entry : entries_) { entry.Reset(); } } - void BeforeSplit(const Tree *, int, int, int8_t) override {} + void BeforeSplit(const Tree*, int, int, int8_t) override {} - std::vector Update(const Tree *, + std::vector Update(const Tree*, bool is_numerical_split, int leaf, int new_leaf, int8_t monotone_type, double right_output, - double left_output, int, const SplitInfo &, - const std::vector &) override { + double left_output, int, const SplitInfo& , + const std::vector& ) override { entries_[new_leaf] = entries_[leaf]; if (is_numerical_split) { double mid = (left_output + right_output) / 2.0f; @@ -95,7 +95,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { return std::vector(); } - const ConstraintEntry &Get(int leaf_idx) const override { return entries_[leaf_idx]; } + const ConstraintEntry& Get(int leaf_idx) const override { return entries_[leaf_idx]; } protected: int num_leaves_; @@ -104,7 +104,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { class FastLeafConstraints : public BasicLeafConstraints { public: - explicit FastLeafConstraints(const Config *config, int num_leaves) + explicit FastLeafConstraints(const Config* config, int num_leaves) : BasicLeafConstraints(num_leaves), config_(config) { leaf_is_in_monotone_subtree_.resize(num_leaves_, false); node_parent_.resize(num_leaves_ - 1, -1); @@ -118,7 +118,7 @@ class FastLeafConstraints : public BasicLeafConstraints { leaves_to_update_.clear(); } - void BeforeSplit(const Tree *tree, int leaf, int new_leaf, + void BeforeSplit(const Tree* tree, int leaf, int new_leaf, int8_t monotone_type) override { if (monotone_type != 0 || leaf_is_in_monotone_subtree_[leaf]) { leaf_is_in_monotone_subtree_[leaf] = true; @@ -146,11 +146,11 @@ class FastLeafConstraints : public BasicLeafConstraints { } } - std::vector Update(const Tree *tree, bool is_numerical_split, int leaf, + std::vector Update(const Tree* tree, bool is_numerical_split, int leaf, int new_leaf, int8_t monotone_type, double right_output, double left_output, - int split_feature, const SplitInfo &split_info, - const std::vector &best_split_per_leaf) override{ + int split_feature, const SplitInfo& split_info, + const std::vector& best_split_per_leaf) override{ leaves_to_update_.clear(); if (leaf_is_in_monotone_subtree_[leaf]) { UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, @@ -165,9 +165,9 @@ class FastLeafConstraints : public BasicLeafConstraints { bool OppositeChildShouldBeUpdated( bool is_split_numerical, - const std::vector &features_of_splits_going_up_from_original_leaf, + const std::vector& features_of_splits_going_up_from_original_leaf, int inner_feature, - std::vector &was_original_leaf_right_child_of_split, + std::vector& was_original_leaf_right_child_of_split, bool is_in_right_child) { bool opposite_child_should_be_updated = true; @@ -198,12 +198,12 @@ class FastLeafConstraints : public BasicLeafConstraints { // Recursive function that goes up the tree, and then down to find leaves that // have constraints to be updated void GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, - std::vector &features_of_splits_going_up_from_original_leaf, - std::vector &thresholds_of_splits_going_up_from_original_leaf, - std::vector &was_original_leaf_right_child_of_split, - int split_feature, const SplitInfo &split_info, uint32_t split_threshold, - const std::vector &best_split_per_leaf) { + const Tree* tree, int node_idx, + std::vector& features_of_splits_going_up_from_original_leaf, + std::vector& thresholds_of_splits_going_up_from_original_leaf, + std::vector& was_original_leaf_right_child_of_split, + int split_feature, const SplitInfo& split_info, uint32_t split_threshold, + const std::vector& best_split_per_leaf) { #ifdef DEBUG CHECK(node_idx >= 0); CHECK((unsigned int)node_idx < node_parent_.size()); @@ -264,15 +264,15 @@ class FastLeafConstraints : public BasicLeafConstraints { } void GoDownToFindLeavesToUpdate( - const Tree *tree, int node_idx, - const std::vector &features_of_splits_going_up_from_original_leaf, - const std::vector & + const Tree* tree, int node_idx, + const std::vector& features_of_splits_going_up_from_original_leaf, + const std::vector& thresholds_of_splits_going_up_from_original_leaf, - const std::vector &was_original_leaf_right_child_of_split, + const std::vector& was_original_leaf_right_child_of_split, bool update_max_constraints, int split_feature, - const SplitInfo &split_info, bool use_left_leaf, bool use_right_leaf, + const SplitInfo& split_info, bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, - const std::vector &best_split_per_leaf) { + const std::vector& best_split_per_leaf) { if (node_idx < 0) { int leaf_idx = ~node_idx; @@ -359,10 +359,10 @@ class FastLeafConstraints : public BasicLeafConstraints { } void - GoUpToFindLeavesToUpdate(const Tree *tree, int node_idx, int split_feature, - const SplitInfo &split_info, + GoUpToFindLeavesToUpdate(const Tree* tree, int node_idx, int split_feature, + const SplitInfo& split_info, uint32_t split_threshold, - const std::vector &best_split_per_leaf) { + const std::vector& best_split_per_leaf) { int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; std::vector features_of_splits_going_up_from_original_leaf; @@ -381,11 +381,11 @@ class FastLeafConstraints : public BasicLeafConstraints { } std::pair ShouldKeepGoingLeftRight( - const Tree *tree, int node_idx, - const std::vector &features_of_splits_going_up_from_original_leaf, - const std::vector & + const Tree* tree, int node_idx, + const std::vector& features_of_splits_going_up_from_original_leaf, + const std::vector& thresholds_of_splits_going_up_from_original_leaf, - const std::vector &was_original_leaf_right_child_of_split) { + const std::vector& was_original_leaf_right_child_of_split) { int inner_feature = tree->split_feature_inner(node_idx); uint32_t threshold = tree->threshold_in_bin(node_idx); bool is_split_numerical = tree->IsNumericalSplit(node_idx); @@ -417,7 +417,7 @@ class FastLeafConstraints : public BasicLeafConstraints { } private: - const Config *config_; + const Config* config_; std::vector leaves_to_update_; // add parent node information std::vector node_parent_; @@ -425,7 +425,7 @@ class FastLeafConstraints : public BasicLeafConstraints { std::vector leaf_is_in_monotone_subtree_; }; -LeafConstraintsBase *LeafConstraintsBase::Create(const Config *config, +LeafConstraintsBase* LeafConstraintsBase::Create(const Config* config, int num_leaves) { if (config->monotone_constraints_method == "intermediate") { return new FastLeafConstraints(config, num_leaves); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5ce31ed811d2..aae84b5cf993 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -716,13 +716,11 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { OMP_INIT_EX(); // find splits -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); - if (!col_sampler_.is_feature_used_bytree()[feature_index]) { - continue; - } - if (!histogram_array_[feature_index].is_splittable()) { + if (!col_sampler_.is_feature_used_bytree()[feature_index] | + !histogram_array_[feature_index].is_splittable()) { continue; } const int tid = omp_get_thread_num(); From 8122241292ad274e06d5d1f387dfcd6c4de3ecb8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 9 Mar 2020 08:58:08 +0000 Subject: [PATCH 26/47] Typo. --- src/treelearner/serial_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index aae84b5cf993..b97055fdfe77 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -719,7 +719,7 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); - if (!col_sampler_.is_feature_used_bytree()[feature_index] | + if (!col_sampler_.is_feature_used_bytree()[feature_index] || !histogram_array_[feature_index].is_splittable()) { continue; } From cd4035044a84fafaa343e103aa6420ae5ee69d92 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 10 Mar 2020 12:44:00 +0000 Subject: [PATCH 27/47] Added comment and slightly improved logic of monotone constraints. --- src/treelearner/monotone_constraints.hpp | 58 ++++++++++++++++-------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 69922a6a16c2..b1d54dc0f90a 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -209,6 +209,7 @@ class FastLeafConstraints : public BasicLeafConstraints { CHECK((unsigned int)node_idx < node_parent_.size()); #endif int parent_idx = node_parent_[node_idx]; + // if not at the root if (parent_idx != -1) { int inner_feature = tree->split_feature_inner(parent_idx); int feature = tree->split_feature(parent_idx); @@ -224,6 +225,8 @@ class FastLeafConstraints : public BasicLeafConstraints { is_in_right_child); if (opposite_child_should_be_updated) { + // if there is no monotone constraint on a split, + // then there is no relationship between its left and right leaves' values if (monotone_type != 0) { // these variables correspond to the current split we encounter going // up the tree @@ -236,6 +239,9 @@ class FastLeafConstraints : public BasicLeafConstraints { (monotone_type < 0) ? left_child_is_curr_idx : !left_child_is_curr_idx; + // the opposite child needs to be updated + // so the code needs to go down in the the opposite child + // to see which leaves' constraints need to be updated GoDownToFindLeavesToUpdate( tree, opposite_child_idx, features_of_splits_going_up_from_original_leaf, @@ -245,6 +251,10 @@ class FastLeafConstraints : public BasicLeafConstraints { split_info, true, true, split_threshold, best_split_per_leaf); } + // if opposite_child_should_be_updated, then it means the path to come up there was relevant, + // i.e. that it will be helpful going down to determine which leaf + // is actually contiguous to the original 2 leaves and should be updated + // so the variables associated with the split need to be recorded was_original_leaf_right_child_of_split.push_back( tree->right_child(parent_idx) == node_idx); thresholds_of_splits_going_up_from_original_leaf.push_back( @@ -253,13 +263,12 @@ class FastLeafConstraints : public BasicLeafConstraints { tree->split_feature_inner(parent_idx)); } - if (parent_idx != 0) { - GoUpToFindLeavesToUpdate( - tree, parent_idx, features_of_splits_going_up_from_original_leaf, - thresholds_of_splits_going_up_from_original_leaf, - was_original_leaf_right_child_of_split, split_feature, split_info, - split_threshold, best_split_per_leaf); - } + // since current node is not the root, keep going up + GoUpToFindLeavesToUpdate( + tree, parent_idx, features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, split_feature, split_info, + split_threshold, best_split_per_leaf); } } @@ -273,17 +282,22 @@ class FastLeafConstraints : public BasicLeafConstraints { const SplitInfo& split_info, bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, const std::vector& best_split_per_leaf) { + // if leaf if (node_idx < 0) { int leaf_idx = ~node_idx; - // splits that are not to be used shall not be updated, included leaf at - // max depth + // splits that are not to be used shall not be updated, + // included leaf at max depth if (best_split_per_leaf[leaf_idx].gain == kMinScore) { return; } std::pair min_max_constraints; bool something_changed = false; + // if the current leaf is contiguous with both the new right leaf and the new left leaf + // then it may need to be greater than the max of the 2 or smaller than the min of the 2 + // otherwise, if the current leaf is contiguous with only one of the 2 new leaves, + // then it may need to be greater or smaller than it if (use_right_leaf && use_left_leaf) { min_max_constraints = std::minmax(split_info.right_output, split_info.left_output); @@ -302,7 +316,8 @@ class FastLeafConstraints : public BasicLeafConstraints { CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); } #endif - + // depending on which split made the current leaf and the original leaves contiguous, + // either the min constraint or the max constraint of the current leaf need to be updated if (!update_max_constraints) { something_changed = entries_[leaf_idx].UpdateMinAndReturnBoolIfChanged( min_max_constraints.second); @@ -310,12 +325,13 @@ class FastLeafConstraints : public BasicLeafConstraints { something_changed = entries_[leaf_idx].UpdateMaxAndReturnBoolIfChanged( min_max_constraints.first); } + // If constraints were not updated, then there is no need to update the leaf if (!something_changed) { return; } leaves_to_update_.push_back(leaf_idx); - } else { + } else { // if node // check if the children are contiguous with the original leaf std::pair keep_going_left_right = ShouldKeepGoingLeftRight( tree, node_idx, features_of_splits_going_up_from_original_leaf, @@ -324,17 +340,22 @@ class FastLeafConstraints : public BasicLeafConstraints { int inner_feature = tree->split_feature_inner(node_idx); uint32_t threshold = tree->threshold_in_bin(node_idx); bool is_split_numerical = tree->IsNumericalSplit(node_idx); - bool use_left_leaf_for_update = true; - bool use_right_leaf_for_update = true; + bool use_left_leaf_for_update_right = true; + bool use_right_leaf_for_update_left = true; + // if the split is on the same feature (categorical variables not supported) + // then depending on the threshold, + // the current left child may not be contiguous with the original right leaf, + // or the current right child may not be contiguous with the original left leaf if (is_split_numerical && inner_feature == split_feature) { if (threshold >= split_threshold) { - use_left_leaf_for_update = false; + use_left_leaf_for_update_right = false; } if (threshold <= split_threshold) { - use_right_leaf_for_update = false; + use_right_leaf_for_update_left = false; } } + // go down left if (keep_going_left_right.first) { GoDownToFindLeavesToUpdate( tree, tree->left_child(node_idx), @@ -342,9 +363,10 @@ class FastLeafConstraints : public BasicLeafConstraints { thresholds_of_splits_going_up_from_original_leaf, was_original_leaf_right_child_of_split, update_max_constraints, split_feature, split_info, use_left_leaf, - use_right_leaf_for_update && use_right_leaf, split_threshold, + use_right_leaf_for_update_left && use_right_leaf, split_threshold, best_split_per_leaf); } + // go down right if (keep_going_left_right.second) { GoDownToFindLeavesToUpdate( tree, tree->right_child(node_idx), @@ -352,7 +374,7 @@ class FastLeafConstraints : public BasicLeafConstraints { thresholds_of_splits_going_up_from_original_leaf, was_original_leaf_right_child_of_split, update_max_constraints, split_feature, split_info, - use_left_leaf_for_update && use_left_leaf, use_right_leaf, + use_left_leaf_for_update_right && use_left_leaf, use_right_leaf, split_threshold, best_split_per_leaf); } } @@ -393,7 +415,7 @@ class FastLeafConstraints : public BasicLeafConstraints { bool keep_going_right = true; bool keep_going_left = true; // left and right nodes are checked to find out if they are contiguous with - // the original leaf if so the algorithm should keep going down these nodes + // the original leaves if so the algorithm should keep going down these nodes // to update constraints if (is_split_numerical) { for (unsigned int i = 0; From 5bf69f095e561964be39989dea261e37aa38216c Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 16 Mar 2020 11:15:39 +0000 Subject: [PATCH 28/47] Forgot a const. --- src/treelearner/monotone_constraints.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index b1d54dc0f90a..b683ec572fe1 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -167,7 +167,7 @@ class FastLeafConstraints : public BasicLeafConstraints { bool is_split_numerical, const std::vector& features_of_splits_going_up_from_original_leaf, int inner_feature, - std::vector& was_original_leaf_right_child_of_split, + const std::vector& was_original_leaf_right_child_of_split, bool is_in_right_child) { bool opposite_child_should_be_updated = true; From e72662337a2d4ea1b73317c4c0f896888e1ed408 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 16 Mar 2020 11:34:33 +0000 Subject: [PATCH 29/47] Vectors that are to be modified need to be pointers. --- src/treelearner/monotone_constraints.hpp | 34 ++++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index b683ec572fe1..dd6c1d50d049 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -199,9 +199,9 @@ class FastLeafConstraints : public BasicLeafConstraints { // have constraints to be updated void GoUpToFindLeavesToUpdate( const Tree* tree, int node_idx, - std::vector& features_of_splits_going_up_from_original_leaf, - std::vector& thresholds_of_splits_going_up_from_original_leaf, - std::vector& was_original_leaf_right_child_of_split, + std::vector* features_of_splits_going_up_from_original_leaf, + std::vector* thresholds_of_splits_going_up_from_original_leaf, + std::vector* was_original_leaf_right_child_of_split, int split_feature, const SplitInfo& split_info, uint32_t split_threshold, const std::vector& best_split_per_leaf) { #ifdef DEBUG @@ -220,8 +220,8 @@ class FastLeafConstraints : public BasicLeafConstraints { // this is just an optimisation not to waste time going down in subtrees // where there won't be any leaf to update bool opposite_child_should_be_updated = OppositeChildShouldBeUpdated( - is_split_numerical, features_of_splits_going_up_from_original_leaf, - inner_feature, was_original_leaf_right_child_of_split, + is_split_numerical, *features_of_splits_going_up_from_original_leaf, + inner_feature, *was_original_leaf_right_child_of_split, is_in_right_child); if (opposite_child_should_be_updated) { @@ -244,9 +244,9 @@ class FastLeafConstraints : public BasicLeafConstraints { // to see which leaves' constraints need to be updated GoDownToFindLeavesToUpdate( tree, opposite_child_idx, - features_of_splits_going_up_from_original_leaf, - thresholds_of_splits_going_up_from_original_leaf, - was_original_leaf_right_child_of_split, + *features_of_splits_going_up_from_original_leaf, + *thresholds_of_splits_going_up_from_original_leaf, + *was_original_leaf_right_child_of_split, update_max_constraints_in_opposite_child_leaves, split_feature, split_info, true, true, split_threshold, best_split_per_leaf); } @@ -255,11 +255,11 @@ class FastLeafConstraints : public BasicLeafConstraints { // i.e. that it will be helpful going down to determine which leaf // is actually contiguous to the original 2 leaves and should be updated // so the variables associated with the split need to be recorded - was_original_leaf_right_child_of_split.push_back( + was_original_leaf_right_child_of_split->push_back( tree->right_child(parent_idx) == node_idx); - thresholds_of_splits_going_up_from_original_leaf.push_back( + thresholds_of_splits_going_up_from_original_leaf->push_back( tree->threshold_in_bin(parent_idx)); - features_of_splits_going_up_from_original_leaf.push_back( + features_of_splits_going_up_from_original_leaf->push_back( tree->split_feature_inner(parent_idx)); } @@ -387,13 +387,13 @@ class FastLeafConstraints : public BasicLeafConstraints { const std::vector& best_split_per_leaf) { int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; - std::vector features_of_splits_going_up_from_original_leaf; - std::vector thresholds_of_splits_going_up_from_original_leaf; - std::vector was_original_leaf_right_child_of_split; + std::vector* features_of_splits_going_up_from_original_leaf = new std::vector(); + std::vector* thresholds_of_splits_going_up_from_original_leaf = new std::vector(); + std::vector* was_original_leaf_right_child_of_split = new std::vector(); - features_of_splits_going_up_from_original_leaf.reserve(depth); - thresholds_of_splits_going_up_from_original_leaf.reserve(depth); - was_original_leaf_right_child_of_split.reserve(depth); + features_of_splits_going_up_from_original_leaf->reserve(depth); + thresholds_of_splits_going_up_from_original_leaf->reserve(depth); + was_original_leaf_right_child_of_split->reserve(depth); GoUpToFindLeavesToUpdate( tree, node_idx, features_of_splits_going_up_from_original_leaf, From bfd17475785de0b802e24bac86e1d9fa48cff075 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 16 Mar 2020 11:36:33 +0000 Subject: [PATCH 30/47] Rename FastLeafConstraints to IntermediateLeafConstraints to match documentation. --- src/treelearner/monotone_constraints.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index dd6c1d50d049..86d723c1ba4f 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -102,9 +102,9 @@ class BasicLeafConstraints : public LeafConstraintsBase { std::vector entries_; }; -class FastLeafConstraints : public BasicLeafConstraints { +class IntermediateLeafConstraints : public BasicLeafConstraints { public: - explicit FastLeafConstraints(const Config* config, int num_leaves) + explicit IntermediateLeafConstraints(const Config* config, int num_leaves) : BasicLeafConstraints(num_leaves), config_(config) { leaf_is_in_monotone_subtree_.resize(num_leaves_, false); node_parent_.resize(num_leaves_ - 1, -1); @@ -450,7 +450,7 @@ class FastLeafConstraints : public BasicLeafConstraints { LeafConstraintsBase* LeafConstraintsBase::Create(const Config* config, int num_leaves) { if (config->monotone_constraints_method == "intermediate") { - return new FastLeafConstraints(config, num_leaves); + return new IntermediateLeafConstraints(config, num_leaves); } return new BasicLeafConstraints(num_leaves); } From f45f0ec077c676c6dd198d5018a9f0f51bdf1c5b Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 16 Mar 2020 11:54:06 +0000 Subject: [PATCH 31/47] Remove overload of GoUpToFindLeavesToUpdate. --- src/treelearner/monotone_constraints.hpp | 43 +++++++++++------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 86d723c1ba4f..f0ac75d22f79 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -156,8 +156,25 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, monotone_type, right_output, left_output); - GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), split_feature, - split_info, split_info.threshold, + // Initialize variables to store information while going up the tree + int depth = tree->leaf_depth(new_leaf) - 1; + + std::vector *features_of_splits_going_up_from_original_leaf = + new std::vector(); + std::vector *thresholds_of_splits_going_up_from_original_leaf = + new std::vector(); + std::vector *was_original_leaf_right_child_of_split = + new std::vector(); + + features_of_splits_going_up_from_original_leaf->reserve(depth); + thresholds_of_splits_going_up_from_original_leaf->reserve(depth); + was_original_leaf_right_child_of_split->reserve(depth); + + GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), + features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, + split_feature, split_info, split_info.threshold, best_split_per_leaf); } return leaves_to_update_; @@ -380,28 +397,6 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { } } - void - GoUpToFindLeavesToUpdate(const Tree* tree, int node_idx, int split_feature, - const SplitInfo& split_info, - uint32_t split_threshold, - const std::vector& best_split_per_leaf) { - int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; - - std::vector* features_of_splits_going_up_from_original_leaf = new std::vector(); - std::vector* thresholds_of_splits_going_up_from_original_leaf = new std::vector(); - std::vector* was_original_leaf_right_child_of_split = new std::vector(); - - features_of_splits_going_up_from_original_leaf->reserve(depth); - thresholds_of_splits_going_up_from_original_leaf->reserve(depth); - was_original_leaf_right_child_of_split->reserve(depth); - - GoUpToFindLeavesToUpdate( - tree, node_idx, features_of_splits_going_up_from_original_leaf, - thresholds_of_splits_going_up_from_original_leaf, - was_original_leaf_right_child_of_split, split_feature, split_info, - split_threshold, best_split_per_leaf); - } - std::pair ShouldKeepGoingLeftRight( const Tree* tree, int node_idx, const std::vector& features_of_splits_going_up_from_original_leaf, From 745dc449aca6ef877e8655d49525091e23c70c24 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 16 Mar 2020 11:58:08 +0000 Subject: [PATCH 32/47] Stop memory leaking. --- src/treelearner/monotone_constraints.hpp | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index f0ac75d22f79..50aff7932f60 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -159,21 +159,18 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { // Initialize variables to store information while going up the tree int depth = tree->leaf_depth(new_leaf) - 1; - std::vector *features_of_splits_going_up_from_original_leaf = - new std::vector(); - std::vector *thresholds_of_splits_going_up_from_original_leaf = - new std::vector(); - std::vector *was_original_leaf_right_child_of_split = - new std::vector(); + std::vector features_of_splits_going_up_from_original_leaf; + std::vector thresholds_of_splits_going_up_from_original_leaf; + std::vector was_original_leaf_right_child_of_split; - features_of_splits_going_up_from_original_leaf->reserve(depth); - thresholds_of_splits_going_up_from_original_leaf->reserve(depth); - was_original_leaf_right_child_of_split->reserve(depth); + features_of_splits_going_up_from_original_leaf.reserve(depth); + thresholds_of_splits_going_up_from_original_leaf.reserve(depth); + was_original_leaf_right_child_of_split.reserve(depth); GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), - features_of_splits_going_up_from_original_leaf, - thresholds_of_splits_going_up_from_original_leaf, - was_original_leaf_right_child_of_split, + &features_of_splits_going_up_from_original_leaf, + &thresholds_of_splits_going_up_from_original_leaf, + &was_original_leaf_right_child_of_split, split_feature, split_info, split_info.threshold, best_split_per_leaf); } From cf417d3b2147f6b97e7d304c3ccd840144602fb3 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 17 Mar 2020 08:08:47 +0000 Subject: [PATCH 33/47] Fix cpplint issues. --- src/treelearner/monotone_constraints.hpp | 13 +++++++------ src/treelearner/serial_tree_learner.cpp | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 50aff7932f60..622c404e0c9b 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "split_info.hpp" @@ -48,7 +49,7 @@ struct ConstraintEntry { class LeafConstraintsBase { public: - virtual ~LeafConstraintsBase(){}; + virtual ~LeafConstraintsBase(){} virtual const ConstraintEntry& Get(int leaf_idx) const = 0; virtual void Reset() = 0; virtual void BeforeSplit(const Tree* tree, int leaf, int new_leaf, @@ -80,7 +81,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { bool is_numerical_split, int leaf, int new_leaf, int8_t monotone_type, double right_output, double left_output, int, const SplitInfo& , - const std::vector& ) override { + const std::vector&) override { entries_[new_leaf] = entries_[leaf]; if (is_numerical_split) { double mid = (left_output + right_output) / 2.0f; @@ -150,7 +151,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { int new_leaf, int8_t monotone_type, double right_output, double left_output, int split_feature, const SplitInfo& split_info, - const std::vector& best_split_per_leaf) override{ + const std::vector& best_split_per_leaf) override { leaves_to_update_.clear(); if (leaf_is_in_monotone_subtree_[leaf]) { UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, @@ -219,8 +220,8 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { int split_feature, const SplitInfo& split_info, uint32_t split_threshold, const std::vector& best_split_per_leaf) { #ifdef DEBUG - CHECK(node_idx >= 0); - CHECK((unsigned int)node_idx < node_parent_.size()); + CHECK_GE(node_idx >= 0); + CHECK_GE((unsigned int)node_idx < node_parent_.size()); #endif int parent_idx = node_parent_[node_idx]; // if not at the root @@ -345,7 +346,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { } leaves_to_update_.push_back(leaf_idx); - } else { // if node + } else { // if node // check if the children are contiguous with the original leaf std::pair keep_going_left_right = ShouldKeepGoingLeftRight( tree, node_idx, features_of_splits_going_up_from_original_leaf, diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b97055fdfe77..94c6a1553d3f 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -630,7 +630,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.left_output, inner_feature_index, best_split_info, best_split_per_leaf_); // update leave outputs if needed - for (auto leaf: leaves_need_update) { + for (auto leaf : leaves_need_update) { RecomputeBestSplitForLeaf(leaf, &best_split_per_leaf_[leaf]); } } From fd34998875b63f3f059902bc847654e4483bb729 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 17 Mar 2020 08:34:44 +0000 Subject: [PATCH 34/47] Fix checks. --- src/treelearner/monotone_constraints.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 622c404e0c9b..d8a990ba4b50 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -220,8 +220,8 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { int split_feature, const SplitInfo& split_info, uint32_t split_threshold, const std::vector& best_split_per_leaf) { #ifdef DEBUG - CHECK_GE(node_idx >= 0); - CHECK_GE((unsigned int)node_idx < node_parent_.size()); + CHECK_GE(node_idx, 0); + CHECK_LT((unsigned int)node_idx, node_parent_.size()); #endif int parent_idx = node_parent_[node_idx]; // if not at the root From 2d935f3c549c28d81f245a47699552d0bb3d6363 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 17 Mar 2020 09:08:02 +0000 Subject: [PATCH 35/47] Fix more cpplint issues. --- src/treelearner/monotone_constraints.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index d8a990ba4b50..82cdad771ad1 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -49,7 +49,7 @@ struct ConstraintEntry { class LeafConstraintsBase { public: - virtual ~LeafConstraintsBase(){} + virtual ~LeafConstraintsBase() {} virtual const ConstraintEntry& Get(int leaf_idx) const = 0; virtual void Reset() = 0; virtual void BeforeSplit(const Tree* tree, int leaf, int new_leaf, @@ -126,8 +126,8 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { leaf_is_in_monotone_subtree_[new_leaf] = true; } #ifdef DEBUG - CHECK(new_leaf - 1 >= 0); - CHECK((unsigned int)(new_leaf - 1) < node_parent_.size()); + CHECK_GE(new_leaf - 1, 0); + CHECK_LT((unsigned int)(new_leaf - 1), node_parent_.size()); #endif node_parent_[new_leaf - 1] = tree->leaf_parent(leaf); } @@ -326,9 +326,9 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { #ifdef DEBUG if (update_max_constraints) { - CHECK(min_max_constraints.first >= tree->LeafOutput(leaf_idx)); + CHECK_GE(min_max_constraints.first, tree->LeafOutput(leaf_idx)); } else { - CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); + CHECK_LE(min_max_constraints.second, tree->LeafOutput(leaf_idx)); } #endif // depending on which split made the current leaf and the original leaves contiguous, From 410c1e77164714985eb6dacd1010b31d6988b3aa Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:18:40 +0000 Subject: [PATCH 36/47] Refactor config monotone constraints method. --- include/LightGBM/config.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index da563581bf03..ffd48ef55495 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -364,11 +364,6 @@ struct Config { // desc = dropout rate: a fraction of previous trees to drop during the dropout double drop_rate = 0.1; - // alias = monotone_constraining_method, mc_method - // desc = used only if ``monotone_constraints`` is set - // desc = monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may slow the library very slightly. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. - std::string monotone_constraints_method = "basic"; - // desc = used only in ``dart`` // desc = max number of dropped trees during one boosting iteration // desc = ``<=0`` means no limit @@ -441,6 +436,13 @@ struct Config { // desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature std::vector monotone_constraints; + // alias = monotone_constraining_method, mc_method + // desc = used only if ``monotone_constraints`` is set + // desc = monotone constraints method + // descl2 = ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions + // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results + std::string monotone_constraints_method = "basic"; + // type = multi-double // alias = feature_contrib, fc, fp, feature_penalty // default = None From e8fd2707ccac00c1eb2ed84e48332e41003da130 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:43:10 +0000 Subject: [PATCH 37/47] Typos. --- src/io/config.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/config.cpp b/src/io/config.cpp index 051e3ba0006a..0cf1d3c8bf21 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -319,13 +319,13 @@ void Config::CheckParamConflict() { } if (is_parallel && monotone_constraints_method == std::string("intermediate")) { // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. - Log::Warning("cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); + Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); monotone_constraints_method = "basic"; } if (feature_fraction_bynode != 1.0 && monotone_constraints_method == std::string("intermediate")) { // "intermediate" monotone constraints need to recompute splits. If the features are sampled when computing the // split initially, then the sampling needs to be recorded or done once again, which is currently not supported - Log::Warning("cannot use \"intermediate\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method."); + Log::Warning("Cannot use \"intermediate\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method."); monotone_constraints_method = "basic"; } } From a4193a9c7b1f6a5bc8b674ec8719dc8598212dfb Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:44:23 +0000 Subject: [PATCH 38/47] Remove useless empty lines. --- src/treelearner/leaf_splits.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 93c9f71d627f..6d18c3b6f174 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -31,7 +31,6 @@ class LeafSplits { } /*! - * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf * \param data_partition current data partition @@ -46,7 +45,6 @@ class LeafSplits { } /*! - * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf * \param sum_gradients From 2133fa6fbb345de6a2d3817ea3cf1a4f5be0624a Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:45:23 +0000 Subject: [PATCH 39/47] Add new line to separate includes. --- src/treelearner/monotone_constraints.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 82cdad771ad1..950b97d16698 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -11,6 +11,7 @@ #include #include #include + #include "split_info.hpp" namespace LightGBM { From 89dbcec55009a93900fb170b7801b74f237801ce Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:48:06 +0000 Subject: [PATCH 40/47] Replace unsigned ind by size_t. --- src/treelearner/monotone_constraints.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 950b97d16698..be8e1fde383a 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -128,7 +128,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { } #ifdef DEBUG CHECK_GE(new_leaf - 1, 0); - CHECK_LT((unsigned int)(new_leaf - 1), node_parent_.size()); + CHECK_LT(static_cast(new_leaf - 1), node_parent_.size()); #endif node_parent_[new_leaf - 1] = tree->leaf_parent(leaf); } @@ -196,7 +196,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { // leaf need to be updated // therefore, for the same feature, there is no use going down from the // second time going up on the right (or on the left) - for (unsigned int split_idx = 0; + for (size_t split_idx = 0; split_idx < features_of_splits_going_up_from_original_leaf.size(); ++split_idx) { if (features_of_splits_going_up_from_original_leaf[split_idx] == @@ -222,7 +222,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { const std::vector& best_split_per_leaf) { #ifdef DEBUG CHECK_GE(node_idx, 0); - CHECK_LT((unsigned int)node_idx, node_parent_.size()); + CHECK_LT(static_cast(node_idx), node_parent_.size()); #endif int parent_idx = node_parent_[node_idx]; // if not at the root @@ -412,7 +412,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { // the original leaves if so the algorithm should keep going down these nodes // to update constraints if (is_split_numerical) { - for (unsigned int i = 0; + for (size_t i = 0; i < features_of_splits_going_up_from_original_leaf.size(); ++i) { if (features_of_splits_going_up_from_original_leaf[i] == inner_feature) { From 406a79368816fb5767bfd42c0cff6815a84ac4de Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:54:08 +0000 Subject: [PATCH 41/47] Reduce number of trials in tests to decrease CI time. --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d660e6f5e713..b0259ff2127a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1044,7 +1044,7 @@ def is_correctly_constrained(learner): return False return True - number_of_trials = 10 + number_of_trials = 1 for _ in range(number_of_trials): for monotone_constraints_method in ["basic", "intermediate"]: trainset = self.generate_trainset_for_monotone_constraints_tests() From a1cc5131d4227420804913c4596e34324971328b Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 09:55:51 +0000 Subject: [PATCH 42/47] Specify monotone constraints better in tests. --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b0259ff2127a..276b8be9bdef 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1051,7 +1051,7 @@ def is_correctly_constrained(learner): params = { 'min_data': 20, 'num_leaves': 20, - 'monotone_constraints': '1,-1,0', + 'monotone_constraints': [1, -1, 0], "monotone_constraints_method": monotone_constraints_method, "use_missing": False, } From 1eb287aa87b7a7f2b048e4284e791349ce229ddd Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 13:08:48 +0000 Subject: [PATCH 43/47] Removed outer loop in test of monotone constraints. --- tests/python_package_test/test_engine.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 276b8be9bdef..373515f3d17c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1044,19 +1044,17 @@ def is_correctly_constrained(learner): return False return True - number_of_trials = 1 - for _ in range(number_of_trials): - for monotone_constraints_method in ["basic", "intermediate"]: - trainset = self.generate_trainset_for_monotone_constraints_tests() - params = { - 'min_data': 20, - 'num_leaves': 20, - 'monotone_constraints': [1, -1, 0], - "monotone_constraints_method": monotone_constraints_method, - "use_missing": False, - } - constrained_model = lgb.train(params, trainset) - self.assertTrue(is_correctly_constrained(constrained_model)) + for monotone_constraints_method in ["basic", "intermediate"]: + trainset = self.generate_trainset_for_monotone_constraints_tests() + params = { + 'min_data': 20, + 'num_leaves': 20, + 'monotone_constraints': [1, -1, 0], + "monotone_constraints_method": monotone_constraints_method, + "use_missing": False, + } + constrained_model = lgb.train(params, trainset) + self.assertTrue(is_correctly_constrained(constrained_model)) def test_max_bin_by_feature(self): col1 = np.arange(0, 100)[:, np.newaxis] From 51380c267312bfe85a441576e84af6fb7ce05890 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Mar 2020 14:07:14 +0000 Subject: [PATCH 44/47] Added categorical features to the monotone constraints tests. --- tests/python_package_test/test_engine.py | 45 +++++++++++++++--------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 373515f3d17c..98530090c391 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -46,6 +46,9 @@ def constant_metric(preds, train_data): def decreasing_metric(preds, train_data): return ('decreasing_metric', next(decreasing_generator), False) +def categorize(continuous_x): + return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) + class TestEngine(unittest.TestCase): def test_binary(self): @@ -995,14 +998,16 @@ def test_init_with_subset(self): self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin") self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin") - # test if categorical features and monotone features can both be in a dataset without causing issues - def generate_trainset_for_monotone_constraints_tests(self): + def generate_trainset_for_monotone_constraints_tests(self, x3_to_category=True): number_of_dpoints = 3000 x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) x = np.column_stack( - (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_negatively_correlated_with_y)) + (x1_positively_correlated_with_y, + x2_negatively_correlated_with_y, + categorize(x3_negatively_correlated_with_y) if x3_to_category else x3_negatively_correlated_with_y)) + zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) scales = 10. * (np.random.random(6) + 0.5) y = (scales[0] * x1_positively_correlated_with_y @@ -1012,7 +1017,10 @@ def generate_trainset_for_monotone_constraints_tests(self): - scales[4] * x3_negatively_correlated_with_y - np.cos(scales[5] * np.pi * x3_negatively_correlated_with_y) + zs) - trainset = lgb.Dataset(x, label=y) + categorical_features = [] + if x3_to_category: + categorical_features = [2] + trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features) return trainset def test_monotone_constraints(self): @@ -1025,7 +1033,7 @@ def is_decreasing(y): def is_non_monotone(y): return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any() - def is_correctly_constrained(learner): + def is_correctly_constrained(learner, x3_to_category=True): iterations = 10 n = 1000 variable_x = np.linspace(0, 1, n).reshape((n, 1)) @@ -1036,7 +1044,9 @@ def is_correctly_constrained(learner): monotonically_increasing_y = learner.predict(monotonically_increasing_x) monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, fixed_x)) monotonically_decreasing_y = learner.predict(monotonically_decreasing_x) - non_monotone_x = np.column_stack((fixed_x, fixed_x, variable_x)) + non_monotone_x = np.column_stack((fixed_x, + fixed_x, + categorize(variable_x) if x3_to_category else variable_x)) non_monotone_y = learner.predict(non_monotone_x) if not (is_increasing(monotonically_increasing_y) and is_decreasing(monotonically_decreasing_y) @@ -1044,17 +1054,18 @@ def is_correctly_constrained(learner): return False return True - for monotone_constraints_method in ["basic", "intermediate"]: - trainset = self.generate_trainset_for_monotone_constraints_tests() - params = { - 'min_data': 20, - 'num_leaves': 20, - 'monotone_constraints': [1, -1, 0], - "monotone_constraints_method": monotone_constraints_method, - "use_missing": False, - } - constrained_model = lgb.train(params, trainset) - self.assertTrue(is_correctly_constrained(constrained_model)) + for test_with_categorical_variable in [True, False]: + for monotone_constraints_method in ["basic", "intermediate"]: + trainset = self.generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable) + params = { + 'min_data': 20, + 'num_leaves': 20, + 'monotone_constraints': [1, -1, 0], + "monotone_constraints_method": monotone_constraints_method, + "use_missing": False, + } + constrained_model = lgb.train(params, trainset) + self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) def test_max_bin_by_feature(self): col1 = np.arange(0, 100)[:, np.newaxis] From 729fb0ba97d6d7de6d0661d18e58aaccc986293d Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 20 Mar 2020 08:34:03 +0000 Subject: [PATCH 45/47] Add blank line. --- tests/python_package_test/test_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 98530090c391..2de34b56f841 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -46,6 +46,7 @@ def constant_metric(preds, train_data): def decreasing_metric(preds, train_data): return ('decreasing_metric', next(decreasing_generator), False) + def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) From 4024536c37069b2701fb6583e1f29816950f4c75 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 20 Mar 2020 09:06:07 +0000 Subject: [PATCH 46/47] Regenerate parameters automatically. --- docs/Parameters.rst | 16 ++++++++++------ src/io/config_auto.cpp | 12 ++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 08fd07b0781a..0d9ade659fef 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -376,12 +376,6 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout -- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method``, ``mc_method`` - - - used only if ``monotone_constraints`` is set - - - monotone constraints method: if set to "basic", the most basic monotone constraints method will be used. It does not slow the library at all, but over-constrains the predictions. If set to "intermediate", a more advanced method will be used, which may slow the library very slightly. However, the intermediate method is much less constraining than the basic method and should significantly improve the results. - - ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int - used only in ``dart`` @@ -466,6 +460,16 @@ Learning Control Parameters - you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature +- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method``, ``mc_method`` + + - used only if ``monotone_constraints`` is set + + - monotone constraints method + + - ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions + + - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results + - ``feature_contri`` :raw-html:`🔗︎`, default = ``None``, type = multi-double, aliases: ``feature_contrib``, ``fc``, ``fp``, ``feature_penalty`` - used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index ece6c7ba0bff..5ccba0b6c84b 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -82,11 +82,11 @@ const std::unordered_map& Config::alias_table() { {"lambda", "lambda_l2"}, {"min_split_gain", "min_gain_to_split"}, {"rate_drop", "drop_rate"}, - {"monotone_constraining_method", "monotone_constraints_method"}, - {"mc_method", "monotone_constraints_method"}, {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, + {"monotone_constraining_method", "monotone_constraints_method"}, + {"mc_method", "monotone_constraints_method"}, {"feature_contrib", "feature_contri"}, {"fc", "feature_contri"}, {"fp", "feature_contri"}, @@ -203,7 +203,6 @@ const std::unordered_set& Config::parameter_set() { "lambda_l2", "min_gain_to_split", "drop_rate", - "monotone_constraints_method", "max_drop", "skip_drop", "xgboost_dart_mode", @@ -218,6 +217,7 @@ const std::unordered_set& Config::parameter_set() { "max_cat_to_onehot", "top_k", "monotone_constraints", + "monotone_constraints_method", "feature_contri", "forcedsplits_filename", "refit_decay_rate", @@ -375,8 +375,6 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); } + GetString(params, "monotone_constraints_method", &monotone_constraints_method); + if (GetString(params, "feature_contri", &tmp_str)) { feature_contri = Common::StringToArray(tmp_str, ','); } @@ -624,7 +624,6 @@ std::string Config::SaveMembersToString() const { str_buf << "[lambda_l2: " << lambda_l2 << "]\n"; str_buf << "[min_gain_to_split: " << min_gain_to_split << "]\n"; str_buf << "[drop_rate: " << drop_rate << "]\n"; - str_buf << "[monotone_constraints_method: " << monotone_constraints_method << "]\n"; str_buf << "[max_drop: " << max_drop << "]\n"; str_buf << "[skip_drop: " << skip_drop << "]\n"; str_buf << "[xgboost_dart_mode: " << xgboost_dart_mode << "]\n"; @@ -639,6 +638,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[max_cat_to_onehot: " << max_cat_to_onehot << "]\n"; str_buf << "[top_k: " << top_k << "]\n"; str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast(monotone_constraints), ",") << "]\n"; + str_buf << "[monotone_constraints_method: " << monotone_constraints_method << "]\n"; str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; From 2820a72c25042451fcc624e026a506b2d17d5b20 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 23 Mar 2020 08:14:09 +0000 Subject: [PATCH 47/47] Speed up ShouldKeepGoingLeftRight. --- src/treelearner/monotone_constraints.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index be8e1fde383a..4d804d7fbfa0 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -420,11 +420,17 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { thresholds_of_splits_going_up_from_original_leaf[i] && !was_original_leaf_right_child_of_split[i]) { keep_going_right = false; + if (!keep_going_left) { + break; + } } if (threshold <= thresholds_of_splits_going_up_from_original_leaf[i] && was_original_leaf_right_child_of_split[i]) { keep_going_left = false; + if (!keep_going_right) { + break; + } } } }