microsoft · jameslamb · Jun 8, 2022 · Apr 21, 2022 · Apr 24, 2022 · May 5, 2022
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
@@ -430,52 +430,6 @@ def test_cegb_scaling_equalities(tmp_path):
         assert p1txt == p2txt
 
 
-def test_cegb_split_buffer_clean():
-    # modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
-    # and https://github.com/microsoft/LightGBM/pull/5087
-    # test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree
-    # which is done in the fix #5164
-    # without the fix:
-    #    Check failed: (best_split_info.left_count) > (0)
-
-    R, C = 1000, 100
-    seed = 29
-    np.random.seed(seed)
-    data = np.random.randn(R, C)
-    for i in range(1, C):
-        data[i] += data[0] * np.random.randn()
-
-    N = int(0.8 * len(data))
-    train_data = data[:N]
-    test_data = data[N:]
-    train_y = np.sum(train_data, axis=1)
-    test_y = np.sum(test_data, axis=1)
-
-    train = lgb.Dataset(train_data, train_y, free_raw_data=True)
-    test = lgb.Dataset(test_data, test_y, free_raw_data=True, reference=train)
-
-    # The test is run twice, on cpu and gpu
-    params = {
-        'device': "cpu",
-        'boosting_type': 'gbdt',
-        'objective': 'regression',
-        'max_bin': 255,
-        'num_leaves': 31,
-        'seed': 0,
-        'learning_rate': 0.1,
-        'min_data_in_leaf': 0,
-        'verbose': 2,
-        'min_split_gain': 1000.0,
-        'cegb_penalty_feature_coupled': 5 * np.arange(C),
-        'cegb_penalty_split': 0.0002,
-        'cegb_tradeoff': 10.0,
-        'num_threads': 16,
-        'force_col_wise': True,
-    }
-
-    lgb.train(params, train, num_boost_round=20, valid_sets=test)
-
-
 def test_consistent_state_for_dataset_fields():
 
     def check_asserts(data):

@@ -3566,3 +3566,50 @@ def test_boost_from_average_with_single_leaf_trees():
     preds = model.predict(X)
     mean_preds = np.mean(preds)
     assert y.min() <= mean_preds <= y.max()
+
+
+def test_cegb_split_buffer_clean():
+    # modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
+    # and https://github.com/microsoft/LightGBM/pull/5087
+    # Test that CEGB's ``splits_per_leaf_`` buffer is cleared before each new tree is trained,
+    # which is the fix made in #5164.
+    # Without the fix, training fails with:
+    #    Check failed: (best_split_info.left_count) > (0)
+
+    R, C = 1000, 100
+    seed = 29
+    np.random.seed(seed)
+    data = np.random.randn(R, C)
+    for i in range(1, C):
+        data[i] += data[0] * np.random.randn()
+
+    N = int(0.8 * len(data))
+    train_data = data[:N]
+    test_data = data[N:]
+    train_y = np.sum(train_data, axis=1)
+    test_y = np.sum(test_data, axis=1)
+
+    train = lgb.Dataset(train_data, train_y, free_raw_data=True)
+
+    params = {
+        'device': "cpu",
+        'boosting_type': 'gbdt',
+        'objective': 'regression',
+        'max_bin': 255,
+        'num_leaves': 31,
+        'seed': 0,
+        'learning_rate': 0.1,
+        'min_data_in_leaf': 0,
+        'verbose': -1,
+        'min_split_gain': 1000.0,
+        'cegb_penalty_feature_coupled': 5 * np.arange(C),
+        'cegb_penalty_split': 0.0002,
+        'cegb_tradeoff': 10.0,
+        'num_threads': 16,
+        'force_col_wise': True,
+    }
+
+    model = lgb.train(params, train, num_boost_round=10)
+    predicts = model.predict(test_data)
+    rmse = np.sqrt(np.mean((predicts - test_y) ** 2))
+    assert rmse < 10.0