From 8d5dca2e3a6181ba788f8aa3d7a69d08e9d0ea07 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 31 Oct 2024 08:48:47 -0500 Subject: [PATCH] [python-package] remove support for passing 'feature_name' and 'categorical_feature' through train() and cv() (#6706) --- python-package/lightgbm/engine.py | 69 +----------------------- tests/python_package_test/test_engine.py | 51 +++++++++++------- 2 files changed, 33 insertions(+), 87 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 89910599b0ca..dca6b607cdc7 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -3,7 +3,6 @@ import copy import json -import warnings from collections import OrderedDict, defaultdict from operator import attrgetter from pathlib import Path @@ -15,17 +14,14 @@ from .basic import ( Booster, Dataset, - LGBMDeprecationWarning, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, - _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, - _LGBM_FeatureNameConfiguration, _log_warning, ) from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold @@ -54,15 +50,6 @@ ] -def _emit_dataset_kwarg_warning(calling_function: str, argname: str) -> None: - msg = ( - f"Argument '{argname}' to {calling_function}() is deprecated and will be removed in " - f"a future release. Set '{argname}' when calling lightgbm.Dataset() instead. " - "See https://github.com/microsoft/LightGBM/issues/6435." - ) - warnings.warn(msg, category=LGBMDeprecationWarning, stacklevel=2) - - def _choose_num_iterations(num_boost_round_kwarg: int, params: Dict[str, Any]) -> Dict[str, Any]: """Choose number of boosting rounds. @@ -127,8 +114,6 @@ def train( valid_names: Optional[List[str]] = None, feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, init_model: Optional[Union[str, Path, Booster]] = None, - feature_name: _LGBM_FeatureNameConfiguration = "auto", - categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", keep_training_booster: bool = False, callbacks: Optional[List[Callable]] = None, ) -> Booster: @@ -170,21 +155,6 @@ def train( set the ``metric`` parameter to the string ``"None"`` in ``params``. init_model : str, pathlib.Path, Booster or None, optional (default=None) Filename of LightGBM model or Booster instance used for continue training. - feature_name : list of str, or 'auto', optional (default="auto") - **Deprecated.** Set ``feature_name`` on ``train_set`` instead. - Feature names. - If 'auto' and data is pandas DataFrame, data columns names are used. - categorical_feature : list of str or int, or 'auto', optional (default="auto") - **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead. - Categorical features. - If list of int, interpreted as indices. - If list of str, interpreted as feature names (need to specify ``feature_name`` as well). - If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. - All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). - Large values could be memory consuming. Consider using consecutive integers starting from zero. - All negative values in categorical features will be treated as missing values. 
- The output cannot be monotonically constrained with respect to a categorical feature. - Floating point numbers in categorical features will be rounded towards 0. keep_training_booster : bool, optional (default=False) Whether the returned Booster will be used to keep training. If False, the returned value will be converted into _InnerPredictor before returning. @@ -233,13 +203,6 @@ def train( f"Item {i} has type '{type(valid_item).__name__}'." ) - # raise deprecation warnings if necessary - # ref: https://github.com/microsoft/LightGBM/issues/6435 - if categorical_feature != "auto": - _emit_dataset_kwarg_warning("train", "categorical_feature") - if feature_name != "auto": - _emit_dataset_kwarg_warning("train", "feature_name") - # create predictor first params = copy.deepcopy(params) params = _choose_param_value( @@ -278,9 +241,7 @@ def train( else: init_iteration = 0 - train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature( - categorical_feature - ) + train_set._update_params(params)._set_predictor(predictor) is_valid_contain_train = False train_data_name = "training" @@ -642,8 +603,6 @@ def cv( metrics: Optional[Union[str, List[str]]] = None, feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, init_model: Optional[Union[str, Path, Booster]] = None, - feature_name: _LGBM_FeatureNameConfiguration = "auto", - categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", fpreproc: Optional[_LGBM_PreprocFunction] = None, seed: int = 0, callbacks: Optional[List[Callable]] = None, @@ -699,21 +658,6 @@ def cv( set ``metrics`` to the string ``"None"``. init_model : str, pathlib.Path, Booster or None, optional (default=None) Filename of LightGBM model or Booster instance used for continue training. - feature_name : list of str, or 'auto', optional (default="auto") - **Deprecated.** Set ``feature_name`` on ``train_set`` instead. - Feature names. - If 'auto' and data is pandas DataFrame, data columns names are used. - categorical_feature : list of str or int, or 'auto', optional (default="auto") - **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead. - Categorical features. - If list of int, interpreted as indices. - If list of str, interpreted as feature names (need to specify ``feature_name`` as well). - If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. - All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). - Large values could be memory consuming. Consider using consecutive integers starting from zero. - All negative values in categorical features will be treated as missing values. - The output cannot be monotonically constrained with respect to a categorical feature. - Floating point numbers in categorical features will be rounded towards 0. fpreproc : callable or None, optional (default=None) Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those. 
@@ -767,13 +711,6 @@ def cv( if not isinstance(train_set, Dataset): raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") - # raise deprecation warnings if necessary - # ref: https://github.com/microsoft/LightGBM/issues/6435 - if categorical_feature != "auto": - _emit_dataset_kwarg_warning("cv", "categorical_feature") - if feature_name != "auto": - _emit_dataset_kwarg_warning("cv", "feature_name") - params = copy.deepcopy(params) params = _choose_param_value( main_param_name="objective", @@ -818,9 +755,7 @@ def cv( params.pop(metric_alias, None) params["metric"] = metrics - train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature( - categorical_feature - ) + train_set._update_params(params)._set_predictor(predictor) results = defaultdict(list) cvfolds = _make_n_folds( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 286f066a3526..9ae471e7f4b9 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1459,7 +1459,7 @@ def test_parameters_are_loaded_from_model_file(tmp_path, capsys, rng): ] ) y = rng.uniform(size=(100,)) - ds = lgb.Dataset(X, y) + ds = lgb.Dataset(X, y, categorical_feature=[1, 2]) params = { "bagging_fraction": 0.8, "bagging_freq": 2, @@ -1474,7 +1474,7 @@ def test_parameters_are_loaded_from_model_file(tmp_path, capsys, rng): "verbosity": 0, } model_file = tmp_path / "model.txt" - orig_bst = lgb.train(params, ds, num_boost_round=1, categorical_feature=[1, 2]) + orig_bst = lgb.train(params, ds, num_boost_round=1) orig_bst.save_model(model_file) with model_file.open("rt") as f: model_contents = f.readlines() @@ -1746,16 +1746,18 @@ def test_pandas_categorical(rng_fixed_seed, tmp_path): gbm0 = lgb.train(params, lgb_train, num_boost_round=10) pred0 = gbm0.predict(X_test) assert lgb_train.categorical_feature == "auto" - lgb_train = lgb.Dataset(X, pd.DataFrame(y)) # also test that label can be one-column pd.DataFrame - gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0]) + lgb_train = lgb.Dataset( + X, pd.DataFrame(y), categorical_feature=[0] + ) # also test that label can be one-column pd.DataFrame + gbm1 = lgb.train(params, lgb_train, num_boost_round=10) pred1 = gbm1.predict(X_test) assert lgb_train.categorical_feature == [0] - lgb_train = lgb.Dataset(X, pd.Series(y)) # also test that label can be pd.Series - gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=["A"]) + lgb_train = lgb.Dataset(X, pd.Series(y), categorical_feature=["A"]) # also test that label can be pd.Series + gbm2 = lgb.train(params, lgb_train, num_boost_round=10) pred2 = gbm2.predict(X_test) assert lgb_train.categorical_feature == ["A"] - lgb_train = lgb.Dataset(X, y) - gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=["A", "B", "C", "D"]) + lgb_train = lgb.Dataset(X, y, categorical_feature=["A", "B", "C", "D"]) + gbm3 = lgb.train(params, lgb_train, num_boost_round=10) pred3 = gbm3.predict(X_test) assert lgb_train.categorical_feature == ["A", "B", "C", "D"] categorical_model_path = tmp_path / "categorical.model" @@ -1767,12 +1769,12 @@ def test_pandas_categorical(rng_fixed_seed, tmp_path): pred5 = gbm4.predict(X_test) gbm5 = lgb.Booster(model_str=model_str) pred6 = gbm5.predict(X_test) - lgb_train = lgb.Dataset(X, y) - gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=["A", "B", "C", "D", "E"]) + 
lgb_train = lgb.Dataset(X, y, categorical_feature=["A", "B", "C", "D", "E"]) + gbm6 = lgb.train(params, lgb_train, num_boost_round=10) pred7 = gbm6.predict(X_test) assert lgb_train.categorical_feature == ["A", "B", "C", "D", "E"] - lgb_train = lgb.Dataset(X, y) - gbm7 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[]) + lgb_train = lgb.Dataset(X, y, categorical_feature=[]) + gbm7 = lgb.train(params, lgb_train, num_boost_round=10) pred8 = gbm7.predict(X_test) assert lgb_train.categorical_feature == [] with pytest.raises(AssertionError): @@ -3672,12 +3674,11 @@ def test_linear_trees(tmp_path, rng_fixed_seed): # test with a categorical feature x[:250, 0] = 0 y[:250] += 10 - lgb_train = lgb.Dataset(x, label=y) + lgb_train = lgb.Dataset(x, label=y, categorical_feature=[0]) est = lgb.train( dict(params, linear_tree=True, subsample=0.8, bagging_freq=1), lgb_train, num_boost_round=10, - categorical_feature=[0], ) # test refit: same results on same data est2 = est.refit(x, label=y) @@ -3700,10 +3701,20 @@ def test_linear_trees(tmp_path, rng_fixed_seed): # test when num_leaves - 1 < num_features and when num_leaves - 1 > num_features X_train, _, y_train, _ = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) params = {"linear_tree": True, "verbose": -1, "metric": "mse", "seed": 0} - train_data = lgb.Dataset(X_train, label=y_train, params=dict(params, num_leaves=2)) - est = lgb.train(params, train_data, num_boost_round=10, categorical_feature=[0]) - train_data = lgb.Dataset(X_train, label=y_train, params=dict(params, num_leaves=60)) - est = lgb.train(params, train_data, num_boost_round=10, categorical_feature=[0]) + train_data = lgb.Dataset( + X_train, + label=y_train, + params=dict(params, num_leaves=2), + categorical_feature=[0], + ) + est = lgb.train(params, train_data, num_boost_round=10) + train_data = lgb.Dataset( + X_train, + label=y_train, + params=dict(params, num_leaves=60), + categorical_feature=[0], + ) + est = lgb.train(params, train_data, num_boost_round=10) def test_save_and_load_linear(tmp_path): @@ -3714,8 +3725,8 @@ def test_save_and_load_linear(tmp_path): X_train[: X_train.shape[0] // 2, 0] = 0 y_train[: X_train.shape[0] // 2] = 1 params = {"linear_tree": True} - train_data_1 = lgb.Dataset(X_train, label=y_train, params=params) - est_1 = lgb.train(params, train_data_1, num_boost_round=10, categorical_feature=[0]) + train_data_1 = lgb.Dataset(X_train, label=y_train, params=params, categorical_feature=[0]) + est_1 = lgb.train(params, train_data_1, num_boost_round=10) pred_1 = est_1.predict(X_train) tmp_dataset = str(tmp_path / "temp_dataset.bin")
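Note on usage after this change (not part of the patch itself): 'feature_name' and 'categorical_feature' are now configured only on lightgbm.Dataset and are no longer accepted by train() or cv(). The sketch below is a minimal illustration of that call pattern; the data, column names, and parameter values are invented for the example and are not taken from this patch.

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(100, 3))
X[:, 0] = rng.integers(0, 4, size=100)  # illustrative: treat the first column as categorical
y = rng.uniform(size=100)

# Feature names and categorical features are set on the Dataset ...
train_set = lgb.Dataset(
    X,
    label=y,
    feature_name=["a", "b", "c"],   # previously also accepted by train()/cv(); now Dataset-only
    categorical_feature=["a"],
)

# ... and train()/cv() are called without those keyword arguments.
# Passing them after this change raises a TypeError (unexpected keyword argument).
params = {"objective": "regression", "verbosity": -1}
booster = lgb.train(params, train_set, num_boost_round=10)
cv_results = lgb.cv(params, train_set, num_boost_round=10, nfold=3, stratified=False)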