Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clear split info buffer in cost efficient gradient boosting before every iteration (partially fixes #3679) #5164

Merged
merged 9 commits into from
Jun 8, 2022
14 changes: 14 additions & 0 deletions src/treelearner/cost_effective_gradient_boosting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/threading.h>

#include <vector>

Expand All @@ -32,6 +33,7 @@ class CostEfficientGradientBoosting {
return true;
}
}

void Init() {
auto train_data = tree_learner_->train_data_;
if (!init_) {
Expand Down Expand Up @@ -63,6 +65,17 @@ class CostEfficientGradientBoosting {
}
init_ = true;
}

/*!
 * \brief Reset every cached SplitInfo held in splits_per_leaf_.
 *
 * Invoked by SerialTreeLearner::BeforeTrain() whenever a CEGB helper is
 * attached, so that split candidates recorded by DeltaGain() for an earlier
 * tree are not picked up while the next tree is being grown.
 */
void BeforeTrain() {
  const size_t num_cached_splits = splits_per_leaf_.size();
  // The buffer is cleared in parallel; each worker receives a disjoint
  // [range_begin, range_end) slice, so entries are never touched twice.
  // NOTE(review): 1024 is presumably the minimum slice size per worker -- confirm
  // against Threading::For.
  Threading::For<size_t>(0, num_cached_splits, 1024,
    [this] (int /*thread_index*/, size_t range_begin, size_t range_end) {
      size_t slot = range_begin;
      while (slot < range_end) {
        splits_per_leaf_[slot].Reset();
        ++slot;
      }
    });
}

double DeltaGain(int feature_index, int real_fidx, int leaf_index,
int num_data_in_leaf, SplitInfo split_info) {
auto config = tree_learner_->config_;
Expand All @@ -82,6 +95,7 @@ class CostEfficientGradientBoosting {
feature_index] = split_info;
return delta;
}

void UpdateLeafBestSplits(Tree* tree, int best_leaf,
const SplitInfo* best_split_info,
std::vector<SplitInfo>* best_split_per_leaf) {
Expand Down
4 changes: 4 additions & 0 deletions src/treelearner/serial_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ void SerialTreeLearner::BeforeTrain() {
}

larger_leaf_splits_->Init();

if (cegb_ != nullptr) {
cegb_->BeforeTrain();
}
}

bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
Expand Down
47 changes: 47 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3566,3 +3566,50 @@ def test_boost_from_average_with_single_leaf_trees():
preds = model.predict(X)
mean_preds = np.mean(preds)
assert y.min() <= mean_preds <= y.max()


def test_cegb_split_buffer_clean():
    """Regression test: CEGB must clear ``splits_per_leaf_`` before each new tree.

    Trains a small regression model with the CEGB penalties enabled and checks
    that training completes and the hold-out RMSE stays below 10.0.
    """
    # modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
    # and https://github.com/microsoft/LightGBM/pull/5087
    # test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree
    # which is done in the fix #5164
    # without the fix:
    #    Check failed: (best_split_info.left_count) > (0)

    R, C = 1000, 100
    seed = 29
    np.random.seed(seed)
    data = np.random.randn(R, C)
    # NOTE(review): ``data[i]`` selects row i, so this perturbs rows 1..C-1 using
    # row 0 rather than correlating columns. Left unchanged on purpose: the
    # failure described above was reproduced with exactly this data.
    for i in range(1, C):
        data[i] += data[0] * np.random.randn()

    # 80/20 train/test split; the target is the row-wise sum of the features.
    N = int(0.8 * len(data))
    train_data = data[:N]
    test_data = data[N:]
    train_y = np.sum(train_data, axis=1)
    test_y = np.sum(test_data, axis=1)

    train = lgb.Dataset(train_data, train_y, free_raw_data=True)

    params = {
        'device': "cpu",
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'max_bin': 255,
        'num_leaves': 31,
        'seed': 0,
        'learning_rate': 0.1,
        'min_data_in_leaf': 0,
        'verbose': -1,
        'min_split_gain': 1000.0,
        # One coupled penalty per feature, growing linearly with the feature index.
        'cegb_penalty_feature_coupled': 5 * np.arange(C),
        'cegb_penalty_split': 0.0002,
        'cegb_tradeoff': 10.0,
        'num_threads': 16,
        'force_col_wise': True,
    }

    # Several boosting rounds are needed: the stale buffer only matters from the
    # second tree onwards.
    model = lgb.train(params, train, num_boost_round=10)
    predicts = model.predict(test_data)
    rmse = np.sqrt(np.mean((predicts - test_y) ** 2))
    assert rmse < 10.0