Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clear split info buffer in cost-efficient gradient boosting before every iteration (partially fixes #3679) #5164

Merged
merged 9 commits into from
Jun 8, 2022
46 changes: 0 additions & 46 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,52 +430,6 @@ def test_cegb_scaling_equalities(tmp_path):
assert p1txt == p2txt


def test_cegb_split_buffer_clean():
    """Smoke-test CEGB training across several boosting rounds.

    Adapted from
    https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
    and https://github.com/microsoft/LightGBM/pull/5087.

    Exercises the fix from #5164, which cleans the ``splits_per_leaf_``
    buffer of CEGB before a new tree is trained. Without that fix training
    aborts with::

        Check failed: (best_split_info.left_count) > (0)

    The test contains no explicit assertion: it passes when ``lgb.train``
    returns without raising.
    """
    n_rows, n_cols = 1000, 100
    np.random.seed(29)
    features = np.random.randn(n_rows, n_cols)
    # NOTE: ``features[row]`` selects a *row*; rows 1..n_cols-1 each receive a
    # randomly scaled copy of row 0. Kept exactly as in the linked reproducer
    # so the random stream (and therefore the data) is unchanged.
    for row in range(1, n_cols):
        features[row] += features[0] * np.random.randn()

    # 80/20 split; the target is the sum of every feature in the row.
    split_at = int(0.8 * len(features))
    train_x, valid_x = features[:split_at], features[split_at:]
    train_target = train_x.sum(axis=1)
    valid_target = valid_x.sum(axis=1)

    train = lgb.Dataset(train_x, label=train_target, free_raw_data=True)
    test = lgb.Dataset(
        valid_x,
        label=valid_target,
        free_raw_data=True,
        reference=train,
    )

    # One coupled-feature penalty per column: 0, 5, 10, ...
    coupled_penalty = 5 * np.arange(n_cols)

    # Only the "cpu" device is configured for this run.
    params = {
        'device': "cpu",
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'max_bin': 255,
        'num_leaves': 31,
        'seed': 0,
        'learning_rate': 0.1,
        'min_data_in_leaf': 0,
        'verbose': 2,
        'min_split_gain': 1000.0,
        'cegb_penalty_feature_coupled': coupled_penalty,
        'cegb_penalty_split': 0.0002,
        'cegb_tradeoff': 10.0,
        'num_threads': 16,
        'force_col_wise': True,
    }

    lgb.train(params, train, num_boost_round=20, valid_sets=test)


def test_consistent_state_for_dataset_fields():

def check_asserts(data):
Expand Down
47 changes: 47 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3566,3 +3566,50 @@ def test_boost_from_average_with_single_leaf_trees():
preds = model.predict(X)
mean_preds = np.mean(preds)
assert y.min() <= mean_preds <= y.max()


def test_cegb_split_buffer_clean():
    """Check that CEGB training survives multiple trees and predicts sanely.

    Modified from
    https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
    and https://github.com/microsoft/LightGBM/pull/5087.

    Tests that the ``splits_per_leaf_`` of CEGB is cleaned before training a
    new tree, which is done in the fix #5164. Without the fix training fails
    with::

        Check failed: (best_split_info.left_count) > (0)

    After training, the RMSE on the held-out 20% of the rows must stay
    below 10.0.
    """
    R, C = 1000, 100
    seed = 29
    np.random.seed(seed)
    data = np.random.randn(R, C)
    # NOTE: ``data[i]`` indexes rows, so rows 1..C-1 each get a randomly
    # scaled copy of row 0 added. Kept as in the linked reproducer.
    for i in range(1, C):
        data[i] += data[0] * np.random.randn()

    # 80/20 train/test split; the target is the per-row sum of all features.
    N = int(0.8 * len(data))
    train_data = data[:N]
    test_data = data[N:]
    train_y = np.sum(train_data, axis=1)
    test_y = np.sum(test_data, axis=1)

    train = lgb.Dataset(train_data, train_y, free_raw_data=True)

    params = {
        'device': "cpu",
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'max_bin': 255,
        'num_leaves': 31,
        'seed': 0,
        'learning_rate': 0.1,
        'min_data_in_leaf': 0,
        'verbose': -1,
        'min_split_gain': 1000.0,
        # one coupled-feature penalty per column: 0, 5, 10, ...
        'cegb_penalty_feature_coupled': 5 * np.arange(C),
        'cegb_penalty_split': 0.0002,
        'cegb_tradeoff': 10.0,
        'num_threads': 16,
        'force_col_wise': True,
    }

    model = lgb.train(params, train, num_boost_round=10)
    predicts = model.predict(test_data)
    rmse = np.sqrt(np.mean((predicts - test_y) ** 2))
    assert rmse < 10.0