From 8579d5e34f97d797e10cf1e87e5093330992f520 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Tue, 4 Jun 2024 19:46:07 -0500
Subject: [PATCH] [python-package] clarify max_depth warning and limit when it is emitted (#6402)

---
 docs/Parameters-Tuning.rst                |  3 +-
 include/LightGBM/config.h                 |  2 +-
 python-package/lightgbm/sklearn.py        |  1 +
 src/io/config.cpp                         | 28 ++++++++----
 tests/python_package_test/test_basic.py   | 56 ++++++++++++++++++++++-
 tests/python_package_test/test_sklearn.py | 14 ++++++
 6 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst
index ece235f6e6c0..9a3593f7e891 100644
--- a/docs/Parameters-Tuning.rst
+++ b/docs/Parameters-Tuning.rst
@@ -22,7 +22,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
 1. ``num_leaves``. This is the main parameter to control the complexity of the tree model.
    Theoretically, we can set ``num_leaves = 2^(max_depth)`` to obtain the same number of leaves as depth-wise tree.
    However, this simple conversion is not good in practice.
-   The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
+   A leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
    Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``.
    For example, when the ``max_depth=7`` the depth-wise tree can get good accuracy,
    but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise.
@@ -33,6 +33,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
    In practice, setting it to hundreds or thousands is enough for a large dataset.
 
 3. ``max_depth``. You also can use ``max_depth`` to limit the tree depth explicitly.
+   If you set ``max_depth``, also explicitly set ``num_leaves`` to some value ``<= 2^max_depth``.
 
 For Faster Speed
 ----------------
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index b626e1b1bcc2..b9d71aaa9882 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -1142,7 +1142,7 @@ struct Config {
   static const std::string DumpAliases();
 
  private:
-  void CheckParamConflict();
+  void CheckParamConflict(const std::unordered_map<std::string, std::string>& params);
   void GetMembersFromString(const std::unordered_map<std::string, std::string>& params);
   std::string SaveMembersToString() const;
   void GetAucMuWeights();
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 46f41a428348..cb577c18c265 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -492,6 +492,7 @@ def __init__(
             Maximum tree leaves for base learners.
         max_depth : int, optional (default=-1)
             Maximum tree depth for base learners, <=0 means no limit.
+            If setting this to a positive value, consider also changing ``num_leaves`` to ``<= 2^max_depth``.
         learning_rate : float, optional (default=0.1)
             Boosting learning rate.
             You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
diff --git a/src/io/config.cpp b/src/io/config.cpp
index e25bb6d4fd70..7516ddbd4ac6 100644
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -289,14 +289,14 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
   }
 
   // check for conflicts
-  CheckParamConflict();
+  CheckParamConflict(params);
 }
 
 bool CheckMultiClassObjective(const std::string& objective) {
   return (objective == std::string("multiclass") || objective == std::string("multiclassova"));
 }
 
-void Config::CheckParamConflict() {
+void Config::CheckParamConflict(const std::unordered_map<std::string, std::string>& params) {
   // check if objective, metric, and num_class match
   int num_class_check = num_class;
   bool objective_type_multiclass = CheckMultiClassObjective(objective) || (objective == std::string("custom") && num_class_check > 1);
@@ -356,14 +356,24 @@
                  tree_learner.c_str());
     }
   }
-  // Check max_depth and num_leaves
-  if (max_depth > 0) {
+
+  // max_depth defaults to -1, so max_depth>0 implies "you explicitly overrode the default"
+  //
+  // Changing max_depth while leaving num_leaves at its default (31) can lead to 2 undesirable situations:
+  //
+  // * (0 <= max_depth <= 4) it's not possible to produce a tree with 31 leaves
+  //   - this block reduces num_leaves to 2^max_depth
+  // * (max_depth > 4) 31 leaves is less than a full depth-wise tree, which might lead to underfitting
+  //   - this block warns about that
+  // ref: https://github.com/microsoft/LightGBM/issues/2898#issuecomment-1002860601
+  if (max_depth > 0 && (params.count("num_leaves") == 0 || params.at("num_leaves").empty())) {
     double full_num_leaves = std::pow(2, max_depth);
-    if (full_num_leaves > num_leaves
-        && num_leaves == kDefaultNumLeaves) {
-      Log::Warning("Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves."
-                   " (num_leaves=%d).",
-                   num_leaves);
+    if (full_num_leaves > num_leaves) {
+      Log::Warning("Provided parameters constrain tree depth (max_depth=%d) without explicitly setting 'num_leaves'. "
+                   "This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<=%.0f) in params. "
+                   "Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity.",
+                   max_depth,
+                   full_num_leaves);
     }
 
     if (full_num_leaves < num_leaves) {
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index e2f379dad9d3..14a621a1604f 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -9,7 +9,7 @@
 import numpy as np
 import pytest
 from scipy import sparse
-from sklearn.datasets import dump_svmlight_file, load_svmlight_file
+from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
 from sklearn.model_selection import train_test_split
 
 import lightgbm as lgb
@@ -890,3 +890,57 @@ def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Datas
         data=rng.standard_normal(size=(100, 3)),
     )
     assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]
+
+
+# NOTE: this intentionally contains values where num_leaves is <, ==, and > 2^max_depth
+@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
+def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_leaves": num_leaves,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
+# NOTE: max_depth < 5 is significant here because the default num_leaves is 31. With max_depth=5,
+# a full depth-wise tree would have 2^5 = 32 leaves.
+@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
+def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
+@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
+def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    expected_warning = (
+        f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
+        f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
+        "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
+    )
+    assert expected_warning in capsys.readouterr().out
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index a995bfcae6b2..b458192a2ee0 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1276,6 +1276,20 @@ def test_check_is_fitted():
     check_is_fitted(model)
 
 
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+@pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
+def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    params = {"n_estimators": 1, "max_depth": max_depth, "verbose": 0}
+    if estimator_class is lgb.LGBMModel:
+        estimator_class(**{**params, "objective": "binary"}).fit(X, y)
+    elif estimator_class is lgb.LGBMRanker:
+        estimator_class(**params).fit(X, y, group=np.ones(X.shape[0]))
+    else:
+        estimator_class(**params).fit(X, y)
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)
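
Below is a minimal sketch (not part of the patch) of the user-facing behaviour the new tests exercise, assuming a LightGBM build that includes this change. It reuses the same synthetic make_blobs data as the tests; the specific values max_depth=6 and num_leaves=63 are only illustrative. The first call leaves num_leaves at its default of 31, so the new warning should be printed; the second passes num_leaves (<= 2^max_depth) explicitly, so no warning should appear.

    import lightgbm as lgb
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=1_000, n_features=2, centers=2)

    # max_depth overridden while num_leaves stays at its default (31): the new warning is expected
    lgb.train(
        params={"objective": "binary", "max_depth": 6, "num_iterations": 1, "verbose": 0},
        train_set=lgb.Dataset(X, label=y),
    )

    # num_leaves passed explicitly alongside max_depth (63 <= 2^6): no warning is expected
    lgb.train(
        params={"objective": "binary", "max_depth": 6, "num_leaves": 63, "num_iterations": 1, "verbose": 0},
        train_set=lgb.Dataset(X, label=y),
    )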