Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clear split info buffer in cost-efficient gradient boosting before every iteration (partially fixes #3679) #5164

Merged
merged 9 commits into from
Jun 8, 2022
13 changes: 13 additions & 0 deletions src/treelearner/cost_effective_gradient_boosting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class CostEfficientGradientBoosting {
return true;
}
}

void Init() {
auto train_data = tree_learner_->train_data_;
if (!init_) {
Expand Down Expand Up @@ -63,6 +64,17 @@ class CostEfficientGradientBoosting {
}
init_ = true;
}

void BeforeTrain() {
// clear the splits in splits_per_leaf_
const int num_total_splits = static_cast<int>(splits_per_leaf_.size());
StrikerRUS marked this conversation as resolved.
Show resolved Hide resolved
const int num_threads = OMP_NUM_THREADS();
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (int i = 0; i < num_total_splits; ++i) {
splits_per_leaf_[i].Reset();
}
}

double DeltaGain(int feature_index, int real_fidx, int leaf_index,
int num_data_in_leaf, SplitInfo split_info) {
auto config = tree_learner_->config_;
Expand All @@ -82,6 +94,7 @@ class CostEfficientGradientBoosting {
feature_index] = split_info;
return delta;
}

void UpdateLeafBestSplits(Tree* tree, int best_leaf,
const SplitInfo* best_split_info,
std::vector<SplitInfo>* best_split_per_leaf) {
Expand Down
4 changes: 4 additions & 0 deletions src/treelearner/serial_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ void SerialTreeLearner::BeforeTrain() {
}

larger_leaf_splits_->Init();

if (cegb_ != nullptr) {
cegb_->BeforeTrain();
}
}

bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
Expand Down
46 changes: 46 additions & 0 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,52 @@ def test_cegb_scaling_equalities(tmp_path):
assert p1txt == p2txt


def test_cegb_split_buffer_clean():
    # modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
    # and https://github.com/microsoft/LightGBM/pull/5087
    # test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree
    # which is done in the fix #5164
    # without the fix:
    #     Check failed: (best_split_info.left_count) > (0)
    # There is no explicit assertion here: the test passes when training
    # completes without that check failing.

    # synthetic data: R rows x C columns of standard-normal noise
    R, C = 1000, 100
    seed = 29
    np.random.seed(seed)
    data = np.random.randn(R, C)
    # NOTE: ``data[i]`` indexes rows, so this adds a randomly scaled copy of
    # row 0 to each of rows 1..C-1
    for i in range(1, C):
        data[i] += data[0] * np.random.randn()

    # first 80% of the rows for training, the rest for validation;
    # the target is the sum of each row's features
    N = int(0.8 * len(data))
    train_data = data[:N]
    test_data = data[N:]
    train_y = np.sum(train_data, axis=1)
    test_y = np.sum(test_data, axis=1)

    train = lgb.Dataset(train_data, train_y, free_raw_data=True)
    test = lgb.Dataset(test_data, test_y, free_raw_data=True, reference=train)

    # This configuration trains on CPU only ('device' is fixed to "cpu" below);
    # the CEGB penalties are set through the 'cegb_*' parameters.
    params = {
        'device': "cpu",
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'max_bin': 255,
        'num_leaves': 31,
        'seed': 0,
        'learning_rate': 0.1,
        'min_data_in_leaf': 0,
        'verbose': 2,
        'min_split_gain': 1000.0,
        'cegb_penalty_feature_coupled': 5 * np.arange(C),
        'cegb_penalty_split': 0.0002,
        'cegb_tradeoff': 10.0,
        'num_threads': 16,
        'force_col_wise': True,
    }

    lgb.train(params, train, num_boost_round=20, valid_sets=test)
StrikerRUS marked this conversation as resolved.
Show resolved Hide resolved


def test_consistent_state_for_dataset_fields():

def check_asserts(data):
Expand Down