From 8579d5e34f97d797e10cf1e87e5093330992f520 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Tue, 4 Jun 2024 19:46:07 -0500
Subject: [PATCH] [python-package] clarify max_depth warning and limit when it is emitted (#6402)

---
 docs/Parameters-Tuning.rst                |  3 +-
 include/LightGBM/config.h                 |  2 +-
 python-package/lightgbm/sklearn.py        |  1 +
 src/io/config.cpp                         | 28 ++++++++----
 tests/python_package_test/test_basic.py   | 56 ++++++++++++++++++++++-
 tests/python_package_test/test_sklearn.py | 14 ++++++
 6 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst
index ece235f6e6c0..9a3593f7e891 100644
--- a/docs/Parameters-Tuning.rst
+++ b/docs/Parameters-Tuning.rst
@@ -22,7 +22,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
 1. ``num_leaves``. This is the main parameter to control the complexity of the tree model.
    Theoretically, we can set ``num_leaves = 2^(max_depth)`` to obtain the same number of leaves as depth-wise tree.
    However, this simple conversion is not good in practice.
-   The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
+   A leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
    Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``.
    For example, when the ``max_depth=7`` the depth-wise tree can get good accuracy,
    but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise.
@@ -33,6 +33,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
    In practice, setting it to hundreds or thousands is enough for a large dataset.
 
 3. ``max_depth``. You also can use ``max_depth`` to limit the tree depth explicitly.
+   If you set ``max_depth``, also explicitly set ``num_leaves`` to some value ``<= 2^max_depth``.
 
 For Faster Speed
 ----------------
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index b626e1b1bcc2..b9d71aaa9882 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -1142,7 +1142,7 @@ struct Config {
   static const std::string DumpAliases();
 
  private:
-  void CheckParamConflict();
+  void CheckParamConflict(const std::unordered_map<std::string, std::string>& params);
   void GetMembersFromString(const std::unordered_map<std::string, std::string>& params);
   std::string SaveMembersToString() const;
   void GetAucMuWeights();
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 46f41a428348..cb577c18c265 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -492,6 +492,7 @@ def __init__(
             Maximum tree leaves for base learners.
         max_depth : int, optional (default=-1)
             Maximum tree depth for base learners, <=0 means no limit.
+            If setting this to a positive value, consider also changing ``num_leaves`` to ``<= 2^max_depth``.
         learning_rate : float, optional (default=0.1)
             Boosting learning rate.
             You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
diff --git a/src/io/config.cpp b/src/io/config.cpp
index e25bb6d4fd70..7516ddbd4ac6 100644
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -289,14 +289,14 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
   }
 
   // check for conflicts
-  CheckParamConflict();
+  CheckParamConflict(params);
 }
 
 bool CheckMultiClassObjective(const std::string& objective) {
   return (objective == std::string("multiclass") || objective == std::string("multiclassova"));
 }
 
-void Config::CheckParamConflict() {
+void Config::CheckParamConflict(const std::unordered_map<std::string, std::string>& params) {
   // check if objective, metric, and num_class match
   int num_class_check = num_class;
   bool objective_type_multiclass = CheckMultiClassObjective(objective) || (objective == std::string("custom") && num_class_check > 1);
@@ -356,14 +356,24 @@
                  tree_learner.c_str());
     }
   }
-  // Check max_depth and num_leaves
-  if (max_depth > 0) {
+
+  // max_depth defaults to -1, so max_depth>0 implies "you explicitly overrode the default"
+  //
+  // Changing max_depth while leaving num_leaves at its default (31) can lead to 2 undesirable situations:
+  //
+  // * (0 <= max_depth <= 4) it's not possible to produce a tree with 31 leaves
+  //   - this block reduces num_leaves to 2^max_depth
+  // * (max_depth > 4) 31 leaves is less than a full depth-wise tree, which might lead to underfitting
+  //   - this block warns about that
+  // ref: https://github.com/microsoft/LightGBM/issues/2898#issuecomment-1002860601
+  if (max_depth > 0 && (params.count("num_leaves") == 0 || params.at("num_leaves").empty())) {
     double full_num_leaves = std::pow(2, max_depth);
-    if (full_num_leaves > num_leaves
-        && num_leaves == kDefaultNumLeaves) {
-      Log::Warning("Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves."
-                   " (num_leaves=%d).",
-                   num_leaves);
+    if (full_num_leaves > num_leaves) {
+      Log::Warning("Provided parameters constrain tree depth (max_depth=%d) without explicitly setting 'num_leaves'. "
+                   "This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<=%.0f) in params. "
+                   "Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity.",
+                   max_depth,
+                   full_num_leaves);
     }
 
     if (full_num_leaves < num_leaves) {
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index e2f379dad9d3..14a621a1604f 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -9,7 +9,7 @@
 import numpy as np
 import pytest
 from scipy import sparse
-from sklearn.datasets import dump_svmlight_file, load_svmlight_file
+from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
 from sklearn.model_selection import train_test_split
 
 import lightgbm as lgb
@@ -890,3 +890,57 @@ def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Datas
         data=rng.standard_normal(size=(100, 3)),
     )
     assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]
+
+
+# NOTE: this intentionally contains values where num_leaves is <, ==, and > 2^max_depth
+@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
+def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_leaves": num_leaves,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
+# NOTE: max_depth < 5 is significant here because the default num_leaves is 31. With max_depth=5,
+# a full depth-wise tree would have 2^5 = 32 leaves.
+@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
+def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
+@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
+def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    expected_warning = (
+        f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
+        f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
+        "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
+    )
+    assert expected_warning in capsys.readouterr().out
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index a995bfcae6b2..b458192a2ee0 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1276,6 +1276,20 @@ def test_check_is_fitted():
     check_is_fitted(model)
 
 
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+@pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
+def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    params = {"n_estimators": 1, "max_depth": max_depth, "verbose": 0}
+    if estimator_class is lgb.LGBMModel:
+        estimator_class(**{**params, "objective": "binary"}).fit(X, y)
+    elif estimator_class is lgb.LGBMRanker:
+        estimator_class(**params).fit(X, y, group=np.ones(X.shape[0]))
+    else:
+        estimator_class(**params).fit(X, y)
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)
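
Below is a minimal sketch (not part of the patch) of the user-facing behaviour the new tests exercise, assuming a LightGBM build that includes this change. It reuses the same synthetic make_blobs data as the tests; the specific values max_depth=6 and num_leaves=63 are only illustrative. The first call leaves num_leaves at its default of 31, so the new warning should be printed; the second passes num_leaves (<= 2^max_depth) explicitly, so no warning should appear.

    import lightgbm as lgb
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=1_000, n_features=2, centers=2)

    # max_depth overridden while num_leaves stays at its default (31): the new warning is expected
    lgb.train(
        params={"objective": "binary", "max_depth": 6, "num_iterations": 1, "verbose": 0},
        train_set=lgb.Dataset(X, label=y),
    )

    # num_leaves passed explicitly alongside max_depth (63 <= 2^6): no warning is expected
    lgb.train(
        params={"objective": "binary", "max_depth": 6, "num_leaves": 63, "num_iterations": 1, "verbose": 0},
        train_set=lgb.Dataset(X, label=y),
    )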