[python-package] clarify max_depth warning and limit when it is emitted (#6402)
jameslamb authored Jun 5, 2024
1 parent e0cda88 commit 8579d5e
Showing 6 changed files with 92 additions and 12 deletions.
3 changes: 2 additions & 1 deletion docs/Parameters-Tuning.rst
@@ -22,7 +22,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
1. ``num_leaves``. This is the main parameter to control the complexity of the tree model.
Theoretically, we can set ``num_leaves = 2^(max_depth)`` to obtain the same number of leaves as depth-wise tree.
However, this simple conversion is not good in practice.
-The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
+A leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``.
For example, when the ``max_depth=7`` the depth-wise tree can get good accuracy,
but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise.
@@ -33,6 +33,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
In practice, setting it to hundreds or thousands is enough for a large dataset.

3. ``max_depth``. You also can use ``max_depth`` to limit the tree depth explicitly.
+If you set ``max_depth``, also explicitly set ``num_leaves`` to some value ``<= 2^max_depth``.

For Faster Speed
----------------
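As a minimal sketch of the tuning advice above (not part of this commit): constrain depth and keep ``num_leaves`` at or below the ``2^max_depth`` ceiling. The synthetic data and parameter values below are illustrative assumptions.

```python
# Illustrative sketch: pair an explicit max_depth with num_leaves <= 2^max_depth.
# Synthetic data and parameter values are assumptions, not from the commit.
import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
X = rng.standard_normal(size=(1_000, 10))
y = rng.integers(0, 2, size=1_000)

params = {
    "objective": "binary",
    "max_depth": 7,
    "num_leaves": 80,  # below 2^7 = 128, per the guidance above
    "verbose": 0,
}
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)
```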
2 changes: 1 addition & 1 deletion include/LightGBM/config.h
@@ -1142,7 +1142,7 @@ struct Config {
static const std::string DumpAliases();

private:
-  void CheckParamConflict();
+  void CheckParamConflict(const std::unordered_map<std::string, std::string>& params);
void GetMembersFromString(const std::unordered_map<std::string, std::string>& params);
std::string SaveMembersToString() const;
void GetAucMuWeights();
1 change: 1 addition & 0 deletions python-package/lightgbm/sklearn.py
@@ -492,6 +492,7 @@ def __init__(
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
Maximum tree depth for base learners, <=0 means no limit.
+If setting this to a positive value, consider also changing ``num_leaves`` to ``<= 2^max_depth``.
learning_rate : float, optional (default=0.1)
Boosting learning rate.
You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
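A hedged illustration of this docstring guidance on the scikit-learn API (not part of the commit; data and values are assumptions):

```python
# Hypothetical usage: pass num_leaves <= 2^max_depth alongside a positive max_depth.
import lightgbm as lgb
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1_000, n_features=4, centers=2, random_state=0)
clf = lgb.LGBMClassifier(
    max_depth=5,
    num_leaves=31,  # 31 <= 2^5 = 32, consistent with the docstring advice
    n_estimators=10,
)
clf.fit(X, y)
```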
28 changes: 19 additions & 9 deletions src/io/config.cpp
@@ -289,14 +289,14 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
}

// check for conflicts
-  CheckParamConflict();
+  CheckParamConflict(params);
}

bool CheckMultiClassObjective(const std::string& objective) {
return (objective == std::string("multiclass") || objective == std::string("multiclassova"));
}

-void Config::CheckParamConflict() {
+void Config::CheckParamConflict(const std::unordered_map<std::string, std::string>& params) {
// check if objective, metric, and num_class match
int num_class_check = num_class;
bool objective_type_multiclass = CheckMultiClassObjective(objective) || (objective == std::string("custom") && num_class_check > 1);
@@ -356,14 +356,24 @@ void Config::CheckParamConflict() {
tree_learner.c_str());
}
}
-  // Check max_depth and num_leaves
-  if (max_depth > 0) {
+
+  // max_depth defaults to -1, so max_depth>0 implies "you explicitly overrode the default"
+  //
+  // Changing max_depth while leaving num_leaves at its default (31) can lead to 2 undesirable situations:
+  //
+  // * (0 <= max_depth <= 4) it's not possible to produce a tree with 31 leaves
+  //   - this block reduces num_leaves to 2^max_depth
+  // * (max_depth > 4) 31 leaves is less than a full depth-wise tree, which might lead to underfitting
+  //   - this block warns about that
+  // ref: https://github.com/microsoft/LightGBM/issues/2898#issuecomment-1002860601
+  if (max_depth > 0 && (params.count("num_leaves") == 0 || params.at("num_leaves").empty())) {
     double full_num_leaves = std::pow(2, max_depth);
-    if (full_num_leaves > num_leaves
-        && num_leaves == kDefaultNumLeaves) {
-      Log::Warning("Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves."
-                   " (num_leaves=%d).",
-                   num_leaves);
+    if (full_num_leaves > num_leaves) {
+      Log::Warning("Provided parameters constrain tree depth (max_depth=%d) without explicitly setting 'num_leaves'. "
+                   "This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<=%.0f) in params. "
+                   "Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity.",
+                   max_depth,
+                   full_num_leaves);
     }

     if (full_num_leaves < num_leaves) {
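A small sketch of when the rewritten warning fires, mirroring the behavior exercised by the tests added below: only when ``max_depth > 0`` and ``num_leaves`` is absent from params. The ``make_blobs`` data here is borrowed from those tests; the specific values are illustrative.

```python
# Sketch of the new warning's trigger conditions (per the change above).
import lightgbm as lgb
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1_000, n_features=1, centers=2, random_state=0)

# 'num_leaves' omitted and 2^8 = 256 > default 31 -> the warning is emitted
lgb.Booster(
    params={"objective": "binary", "max_depth": 8, "num_iterations": 1},
    train_set=lgb.Dataset(X, label=y),
)

# 'num_leaves' passed explicitly -> no warning, even with the same max_depth
lgb.Booster(
    params={"objective": "binary", "max_depth": 8, "num_leaves": 31, "num_iterations": 1},
    train_set=lgb.Dataset(X, label=y),
)
```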
56 changes: 55 additions & 1 deletion tests/python_package_test/test_basic.py
@@ -9,7 +9,7 @@
import numpy as np
import pytest
from scipy import sparse
-from sklearn.datasets import dump_svmlight_file, load_svmlight_file
+from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
from sklearn.model_selection import train_test_split

import lightgbm as lgb
@@ -890,3 +890,57 @@ def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Datas
        data=rng.standard_normal(size=(100, 3)),
    )
    assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]


# NOTE: this intentionally contains values where num_leaves <, ==, and > 2^max_depth
@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    lgb.Booster(
        params={
            "objective": "binary",
            "max_depth": max_depth,
            "num_leaves": num_leaves,
            "num_iterations": 1,
            "verbose": 0,
        },
        train_set=lgb.Dataset(X, label=y),
    )
    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


# NOTE: max_depth < 5 is significant here because the default num_leaves is 31. With max_depth=5,
# a full depth-wise tree would have 2^5 = 32 leaves.
@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
lgb.Booster(
params={
"objective": "binary",
"max_depth": max_depth,
"num_iterations": 1,
"verbose": 0,
},
train_set=lgb.Dataset(X, label=y),
)
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
lgb.Booster(
params={
"objective": "binary",
"max_depth": max_depth,
"num_iterations": 1,
"verbose": 0,
},
train_set=lgb.Dataset(X, label=y),
)
expected_warning = (
f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
"in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
)
assert expected_warning in capsys.readouterr().out
14 changes: 14 additions & 0 deletions tests/python_package_test/test_sklearn.py
@@ -1276,6 +1276,20 @@ def test_check_is_fitted():
    check_is_fitted(model)


@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
@pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
params = {"n_estimators": 1, "max_depth": max_depth, "verbose": 0}
if estimator_class is lgb.LGBMModel:
estimator_class(**{**params, "objective": "binary"}).fit(X, y)
elif estimator_class is lgb.LGBMRanker:
estimator_class(**params).fit(X, y, group=np.ones(X.shape[0]))
else:
estimator_class(**params).fit(X, y)
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
def test_sklearn_integration(estimator, check):
    estimator.set_params(min_child_samples=1, min_data_in_bin=1)
