Skip to content

Commit

Permalink
Clear split info buffer in cost efficient gradient boosting before ev…
Browse files Browse the repository at this point in the history
…ery iteration (fix partially #3679) (#5164)

* clear split info buffer in cegb_ before every iteration

* check nullable of cegb_ in serial_tree_learner.cpp

* add a test case for checking the split buffer in CEGB

* swith to Threading::For instead of raw OpenMP

* apply review suggestions

* apply review comments

* remove device cpu
  • Loading branch information
shiyu1994 authored Jun 8, 2022
1 parent 27d9ad2 commit f1328d5
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/treelearner/cost_effective_gradient_boosting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/threading.h>

#include <vector>

Expand All @@ -32,6 +33,7 @@ class CostEfficientGradientBoosting {
return true;
}
}

void Init() {
auto train_data = tree_learner_->train_data_;
if (!init_) {
Expand Down Expand Up @@ -63,6 +65,17 @@ class CostEfficientGradientBoosting {
}
init_ = true;
}

void BeforeTrain() {
// clear the splits in splits_per_leaf_
Threading::For<size_t>(0, splits_per_leaf_.size(), 1024,
[this] (int /*thread_index*/, size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
splits_per_leaf_[i].Reset();
}
});
}

double DeltaGain(int feature_index, int real_fidx, int leaf_index,
int num_data_in_leaf, SplitInfo split_info) {
auto config = tree_learner_->config_;
Expand All @@ -82,6 +95,7 @@ class CostEfficientGradientBoosting {
feature_index] = split_info;
return delta;
}

void UpdateLeafBestSplits(Tree* tree, int best_leaf,
const SplitInfo* best_split_info,
std::vector<SplitInfo>* best_split_per_leaf) {
Expand Down
4 changes: 4 additions & 0 deletions src/treelearner/serial_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ void SerialTreeLearner::BeforeTrain() {
}

larger_leaf_splits_->Init();

if (cegb_ != nullptr) {
cegb_->BeforeTrain();
}
}

bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
Expand Down
45 changes: 45 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3578,3 +3578,48 @@ def test_boost_from_average_with_single_leaf_trees():
preds = model.predict(X)
mean_preds = np.mean(preds)
assert y.min() <= mean_preds <= y.max()


def test_cegb_split_buffer_clean():
# modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
# and https://github.com/microsoft/LightGBM/pull/5087
# test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree
# which is done in the fix #5164
# without the fix:
# Check failed: (best_split_info.left_count) > (0)

R, C = 1000, 100
seed = 29
np.random.seed(seed)
data = np.random.randn(R, C)
for i in range(1, C):
data[i] += data[0] * np.random.randn()

N = int(0.8 * len(data))
train_data = data[:N]
test_data = data[N:]
train_y = np.sum(train_data, axis=1)
test_y = np.sum(test_data, axis=1)

train = lgb.Dataset(train_data, train_y, free_raw_data=True)

params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'max_bin': 255,
'num_leaves': 31,
'seed': 0,
'learning_rate': 0.1,
'min_data_in_leaf': 0,
'verbose': -1,
'min_split_gain': 1000.0,
'cegb_penalty_feature_coupled': 5 * np.arange(C),
'cegb_penalty_split': 0.0002,
'cegb_tradeoff': 10.0,
'force_col_wise': True,
}

model = lgb.train(params, train, num_boost_round=10)
predicts = model.predict(test_data)
rmse = np.sqrt(mean_squared_error(test_y, predicts))
assert rmse < 10.0

0 comments on commit f1328d5

Please sign in to comment.