From fed9115e18c17d76e1aec35ed0fbc99aec272611 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 21:55:04 +0100 Subject: [PATCH 01/10] MAINT use sklearn_compat for multi-version scikit-learn compatibilities --- python-package/lightgbm/compat.py | 98 +--- python-package/lightgbm/sklearn_compat.py | 643 ++++++++++++++++++++++ 2 files changed, 649 insertions(+), 92 deletions(-) create mode 100644 python-package/lightgbm/sklearn_compat.py diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 96dee6522572..9a3f3a248b6a 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -12,91 +12,11 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.multiclass import check_classification_targets - from sklearn.utils.validation import assert_all_finite, check_array, check_X_y - - try: - from sklearn.exceptions import NotFittedError - from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold - except ImportError: - from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold - from sklearn.utils.validation import NotFittedError - try: - from sklearn.utils.validation import _check_sample_weight - except ImportError: - from sklearn.utils.validation import check_consistent_length - - # dummy function to support older version of scikit-learn - def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any: - check_consistent_length(sample_weight, X) - return sample_weight - - try: - from sklearn.utils.validation import validate_data - except ImportError: - # validate_data() was added in scikit-learn 1.6, this function roughly imitates it for older versions. - # It can be removed when lightgbm's minimum scikit-learn version is at least 1.6. - def validate_data( - _estimator, - X, - y="no_validation", - accept_sparse: bool = True, - # 'force_all_finite' was renamed to 'ensure_all_finite' in scikit-learn 1.6 - ensure_all_finite: bool = False, - ensure_min_samples: int = 1, - # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset' - **ignored_kwargs, - ): - # it's safe to import _num_features unconditionally because: - # - # * it was first added in scikit-learn 0.24.2 - # * lightgbm cannot be used with scikit-learn versions older than that - # * this validate_data() re-implementation will not be called in scikit-learn>=1.6 - # - from sklearn.utils.validation import _num_features - - # _num_features() raises a TypeError on 1-dimensional input. That's a problem - # because scikit-learn's 'check_fit1d' estimator check sets that expectation that - # estimators must raise a ValueError when a 1-dimensional input is passed to fit(). - # - # So here, lightgbm avoids calling _num_features() on 1-dimensional inputs. 
- if hasattr(X, "shape") and len(X.shape) == 1: - n_features_in_ = 1 - else: - n_features_in_ = _num_features(X) - - no_val_y = isinstance(y, str) and y == "no_validation" - - # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here - if no_val_y: - X = check_array( - X, - accept_sparse=accept_sparse, - force_all_finite=ensure_all_finite, - ensure_min_samples=ensure_min_samples, - ) - else: - X, y = check_X_y( - X, - y, - accept_sparse=accept_sparse, - force_all_finite=ensure_all_finite, - ensure_min_samples=ensure_min_samples, - ) - - # this only needs to be updated at fit() time - _estimator.n_features_in_ = n_features_in_ - - # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6 - if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features_in_: - raise ValueError( - f"X has {n_features_in_} features, but {_estimator.__class__.__name__} " - f"is expecting {_estimator._n_features} features as input." - ) - - if no_val_y: - return X - else: - return X, y + from sklearn.utils.validation import assert_all_finite + from sklearn.exceptions import NotFittedError + from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold + from sklearn.utils.validation import _check_sample_weight + from .sklearn_compat.utils.validation import validate_data SKLEARN_INSTALLED = True _LGBMBaseCrossValidator = BaseCrossValidator @@ -144,13 +64,7 @@ class _LGBMRegressorBase: # type: ignore # additional scikit-learn imports only for type hints if TYPE_CHECKING: - # sklearn.utils.Tags can be imported unconditionally once - # lightgbm's minimum scikit-learn version is 1.6 or higher - try: - from sklearn.utils import Tags as _sklearn_Tags - except ImportError: - _sklearn_Tags = None - + from .sklearn_compat.utils import Tags as _sklearn_Tags """pandas""" try: diff --git a/python-package/lightgbm/sklearn_compat.py b/python-package/lightgbm/sklearn_compat.py new file mode 100644 index 000000000000..9ce6792203ce --- /dev/null +++ b/python-package/lightgbm/sklearn_compat.py @@ -0,0 +1,643 @@ +"""Ease developer experience to support multiple versions of scikit-learn. + +This file is intended to be vendored in your project if you do not want to depend on +`sklearn-compat` as a package. Then, you can import directly from this file. + +Be aware that depending on `sklearn-compat` does not add any additional dependencies: +we are only depending on `scikit-learn`. + +Version: 0.1.0 +""" + +from __future__ import annotations + +import platform +import sys +from dataclasses import dataclass, field +from typing import Callable, Literal + +import sklearn +from sklearn.utils._param_validation import validate_parameter_constraints +from sklearn.utils.fixes import parse_version + +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) + + +######################################################################################## +# The following code does not depend on the sklearn version +######################################################################################## + + +# parameters validation +class ParamsValidationMixin: + """Mixin class to validate parameters.""" + + def _validate_params(self): + """Validate types and values of constructor parameters. + + The expected type and values must be defined in the `_parameter_constraints` + class attribute, which is a dictionary `param_name: list of constraints`. 
See + the docstring of `validate_parameter_constraints` for a description of the + accepted constraints. + """ + if hasattr(self, "_parameter_constraints"): + validate_parameter_constraints( + self._parameter_constraints, + self.get_params(deep=False), + caller_name=self.__class__.__name__, + ) + + +# tags infrastructure +def _dataclass_args(): + if sys.version_info < (3, 10): + return {} + return {"slots": True} + + +def get_tags(estimator): + """Get estimator tags in a consistent format across different sklearn versions. + + This function provides compatibility between sklearn versions before and after 1.6. + It returns either a Tags object (sklearn >= 1.6) or a converted Tags object from + the dictionary format (sklearn < 1.6) containing metadata about the estimator's + requirements and capabilities. + + Parameters + ---------- + estimator : estimator object + A scikit-learn estimator instance. + + Returns + ------- + tags : Tags + An object containing metadata about the estimator's requirements and + capabilities (e.g., input types, fitting requirements, classifier/regressor + specific tags). + """ + try: + from sklearn.utils._tags import get_tags + + return get_tags(estimator) + except ImportError: + from sklearn.utils._tags import _safe_tags + + return _to_new_tags(_safe_tags(estimator), estimator) + + +def _to_new_tags(old_tags, estimator=None): + """Utility function convert old tags (dictionary) to new tags (dataclass).""" + input_tags = InputTags( + one_d_array="1darray" in old_tags["X_types"], + two_d_array="2darray" in old_tags["X_types"], + three_d_array="3darray" in old_tags["X_types"], + sparse="sparse" in old_tags["X_types"], + categorical="categorical" in old_tags["X_types"], + string="string" in old_tags["X_types"], + dict="dict" in old_tags["X_types"], + positive_only=old_tags["requires_positive_X"], + allow_nan=old_tags["allow_nan"], + pairwise=old_tags["pairwise"], + ) + target_tags = TargetTags( + required=old_tags["requires_y"], + one_d_labels="1dlabels" in old_tags["X_types"], + two_d_labels="2dlabels" in old_tags["X_types"], + positive_only=old_tags["requires_positive_y"], + multi_output=old_tags["multioutput"] or old_tags["multioutput_only"], + single_output=not old_tags["multioutput_only"], + ) + if estimator is not None and ( + hasattr(estimator, "transform") or hasattr(estimator, "fit_transform") + ): + transformer_tags = TransformerTags( + preserves_dtype=old_tags["preserves_dtype"], + ) + else: + transformer_tags = None + estimator_type = getattr(estimator, "_estimator_type", None) + if estimator_type == "classifier": + classifier_tags = ClassifierTags( + poor_score=old_tags["poor_score"], + multi_class=not old_tags["binary_only"], + multi_label=old_tags["multilabel"], + ) + else: + classifier_tags = None + if estimator_type == "regressor": + regressor_tags = RegressorTags( + poor_score=old_tags["poor_score"], + multi_label=old_tags["multilabel"], + ) + else: + regressor_tags = None + return Tags( + estimator_type=estimator_type, + target_tags=target_tags, + transformer_tags=transformer_tags, + classifier_tags=classifier_tags, + regressor_tags=regressor_tags, + input_tags=input_tags, + array_api_support=old_tags["array_api_support"], + no_validation=old_tags["no_validation"], + non_deterministic=old_tags["non_deterministic"], + requires_fit=old_tags["requires_fit"], + _skip_test=old_tags["_skip_test"], + ) + + +######################################################################################## +# Upgrading for scikit-learn 1.4 
+######################################################################################## + + +if sklearn_version < parse_version("1.4"): + + def _is_fitted(estimator, attributes=None, all_or_any=all): + """Determine if an estimator is fitted + + Parameters + ---------- + estimator : estimator instance + Estimator instance for which the check is performed. + + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. + + all_or_any : callable, {all, any}, default=all + Specify whether all or any of the given attributes must exist. + + Returns + ------- + fitted : bool + Whether the estimator is fitted. + """ + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + return all_or_any([hasattr(estimator, attr) for attr in attributes]) + + if hasattr(estimator, "__sklearn_is_fitted__"): + return estimator.__sklearn_is_fitted__() + + fitted_attrs = [ + v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") + ] + return len(fitted_attrs) > 0 + +else: + from sklearn.utils.validation import _is_fitted # noqa: F401 + + +######################################################################################## +# Upgrading for scikit-learn 1.5 +######################################################################################## + + +if sklearn_version < parse_version("1.5"): + # chunking + # extmath + # fixes + from sklearn.utils import ( + _IS_32BIT, # noqa: F401 + _approximate_mode, # noqa: F401 + _in_unstable_openblas_configuration, # noqa: F401 + gen_batches, # noqa: F401 + gen_even_slices, # noqa: F401 + get_chunk_n_rows, # noqa: F401 + safe_sqr, # noqa: F401 + ) + from sklearn.utils import _chunk_generator as chunk_generator # noqa: F401 + + _IS_WASM = platform.machine() in ["wasm32", "wasm64"] + # indexing + # mask + # missing + # optional dependencies + # user interface + # validation + from sklearn.utils import ( + _determine_key_type, # noqa: F401 + _get_column_indices, # noqa: F401 + _print_elapsed_time, # noqa: F401 + _safe_assign, # noqa: F401 + _safe_indexing, # noqa: F401 + _to_object_array, # noqa: F401 + axis0_safe_slice, # noqa: F401 + check_matplotlib_support, # noqa: F401 + check_pandas_support, # noqa: F401 + indices_to_mask, # noqa: F401 + is_scalar_nan, # noqa: F401 + resample, # noqa: F401 + safe_mask, # noqa: F401 + shuffle, # noqa: F401 + ) + from sklearn.utils import _is_pandas_na as is_pandas_na # noqa: F401 +else: + # chunking + from sklearn.utils._chunking import ( + chunk_generator, # noqa: F401 + gen_batches, # noqa: F401 + gen_even_slices, # noqa: F401 + get_chunk_n_rows, # noqa: F401 + ) + + # indexing + from sklearn.utils._indexing import ( + _determine_key_type, # noqa: F401 + _get_column_indices, # noqa: F401 + _safe_assign, # noqa: F401 + _safe_indexing, # noqa: F401 + resample, # noqa: F401 + shuffle, # noqa: F401 + ) + + # mask + from sklearn.utils._mask import ( + axis0_safe_slice, # noqa: F401 + indices_to_mask, # noqa: F401 + safe_mask, # noqa: F401 + ) + + # missing + from sklearn.utils._missing import ( + is_pandas_na, # noqa: F401 + is_scalar_nan, # noqa: F401 + ) + + # optional dependencies + from sklearn.utils._optional_dependencies import ( # noqa: F401 + check_matplotlib_support, + check_pandas_support, # noqa: F401 + ) + + # 
user interface + from sklearn.utils._user_interface import _print_elapsed_time # noqa: F401 + + # extmath + from sklearn.utils.extmath import ( + _approximate_mode, # noqa: F401 + safe_sqr, # noqa: F401 + ) + + # fixes + from sklearn.utils.fixes import ( + _IS_32BIT, # noqa: F401 + _IS_WASM, # noqa: F401 + _in_unstable_openblas_configuration, # noqa: F401 + ) + + # validation + from sklearn.utils.validation import _to_object_array # noqa: F401 + +######################################################################################## +# Upgrading for scikit-learn 1.6 +######################################################################################## + + +if sklearn_version < parse_version("1.6"): + # test_common + from sklearn.utils.estimator_checks import _construct_instance + + def type_of_target(y, input_name="", *, raise_unknown=False): + # fix for raise_unknown which is introduced in scikit-learn 1.6 + from sklearn.utils.multiclass import type_of_target + + def _raise_or_return(target_type): + """Depending on the value of raise_unknown, either raise an error or + return 'unknown'. + """ + if raise_unknown and target_type == "unknown": + input = input_name if input_name else "data" + raise ValueError(f"Unknown label type for {input}: {y!r}") + else: + return target_type + + target_type = type_of_target(y, input_name=input_name) + return _raise_or_return(target_type) + + def _construct_instances(Estimator): + yield _construct_instance(Estimator) + + # validation + def validate_data(_estimator, /, **kwargs): + if "ensure_all_finite" in kwargs: + force_all_finite = kwargs.pop("ensure_all_finite") + else: + force_all_finite = True + return _estimator._validate_data(**kwargs, force_all_finite=force_all_finite) + + def _check_n_features(estimator, X, *, reset): + return estimator._check_n_features(X, reset=reset) + + def _check_feature_names(estimator, X, *, reset): + return estimator._check_feature_names(X, reset=reset) + + # tags infrastructure + @dataclass(**_dataclass_args()) + class InputTags: + """Tags for the input data. + + Parameters + ---------- + one_d_array : bool, default=False + Whether the input can be a 1D array. + + two_d_array : bool, default=True + Whether the input can be a 2D array. Note that most common + tests currently run only if this flag is set to ``True``. + + three_d_array : bool, default=False + Whether the input can be a 3D array. + + sparse : bool, default=False + Whether the input can be a sparse matrix. + + categorical : bool, default=False + Whether the input can be categorical. + + string : bool, default=False + Whether the input can be an array-like of strings. + + dict : bool, default=False + Whether the input can be a dictionary. + + positive_only : bool, default=False + Whether the estimator requires positive X. + + allow_nan : bool, default=False + Whether the estimator supports data with missing values encoded as `np.nan`. + + pairwise : bool, default=False + This boolean attribute indicates whether the data (`X`), + :term:`fit` and similar methods consists of pairwise measures + over samples rather than a feature representation for each + sample. It is usually `True` where an estimator has a + `metric` or `affinity` or `kernel` parameter with value + 'precomputed'. Its primary purpose is to support a + :term:`meta-estimator` or a cross validation procedure that + extracts a sub-sample of data intended for a pairwise + estimator, where the data needs to be indexed on both axes. 
+ Specifically, this tag is used by + `sklearn.utils.metaestimators._safe_split` to slice rows and + columns. + """ + + one_d_array: bool = False + two_d_array: bool = True + three_d_array: bool = False + sparse: bool = False + categorical: bool = False + string: bool = False + dict: bool = False + positive_only: bool = False + allow_nan: bool = False + pairwise: bool = False + + @dataclass(**_dataclass_args()) + class TargetTags: + """Tags for the target data. + + Parameters + ---------- + required : bool + Whether the estimator requires y to be passed to `fit`, + `fit_predict` or `fit_transform` methods. The tag is ``True`` + for estimators inheriting from `~sklearn.base.RegressorMixin` + and `~sklearn.base.ClassifierMixin`. + + one_d_labels : bool, default=False + Whether the input is a 1D labels (y). + + two_d_labels : bool, default=False + Whether the input is a 2D labels (y). + + positive_only : bool, default=False + Whether the estimator requires a positive y (only applicable + for regression). + + multi_output : bool, default=False + Whether a regressor supports multi-target outputs or a classifier supports + multi-class multi-output. + + single_output : bool, default=True + Whether the target can be single-output. This can be ``False`` if the + estimator supports only multi-output cases. + """ + + required: bool + one_d_labels: bool = False + two_d_labels: bool = False + positive_only: bool = False + multi_output: bool = False + single_output: bool = True + + @dataclass(**_dataclass_args()) + class TransformerTags: + """Tags for the transformer. + + Parameters + ---------- + preserves_dtype : list[str], default=["float64"] + Applies only on transformers. It corresponds to the data types + which will be preserved such that `X_trans.dtype` is the same + as `X.dtype` after calling `transformer.transform(X)`. If this + list is empty, then the transformer is not expected to + preserve the data type. The first value in the list is + considered as the default data type, corresponding to the data + type of the output when the input data type is not going to be + preserved. + """ + + preserves_dtype: list[str] = field(default_factory=lambda: ["float64"]) + + @dataclass(**_dataclass_args()) + class ClassifierTags: + """Tags for the classifier. + + Parameters + ---------- + poor_score : bool, default=False + Whether the estimator fails to provide a "reasonable" test-set + score, which currently for classification is an accuracy of + 0.83 on ``make_blobs(n_samples=300, random_state=0)``. The + datasets and values are based on current estimators in scikit-learn + and might be replaced by something more systematic. + + multi_class : bool, default=True + Whether the classifier can handle multi-class + classification. Note that all classifiers support binary + classification. Therefore this flag indicates whether the + classifier is a binary-classifier-only or not. + + multi_label : bool, default=False + Whether the classifier supports multi-label output. + """ + + poor_score: bool = False + multi_class: bool = True + multi_label: bool = False + + @dataclass(**_dataclass_args()) + class RegressorTags: + """Tags for the regressor. + + Parameters + ---------- + poor_score : bool, default=False + Whether the estimator fails to provide a "reasonable" test-set + score, which currently for regression is an R2 of 0.5 on + ``make_regression(n_samples=200, n_features=10, + n_informative=1, bias=5.0, noise=20, random_state=42)``. 
The + dataset and values are based on current estimators in scikit-learn + and might be replaced by something more systematic. + + multi_label : bool, default=False + Whether the regressor supports multilabel output. + """ + + poor_score: bool = False + multi_label: bool = False + + @dataclass(**_dataclass_args()) + class Tags: + """Tags for the estimator. + + See :ref:`estimator_tags` for more information. + + Parameters + ---------- + estimator_type : str or None + The type of the estimator. Can be one of: + - "classifier" + - "regressor" + - "transformer" + - "clusterer" + - "outlier_detector" + - "density_estimator" + + target_tags : :class:`TargetTags` + The target(y) tags. + + transformer_tags : :class:`TransformerTags` or None + The transformer tags. + + classifier_tags : :class:`ClassifierTags` or None + The classifier tags. + + regressor_tags : :class:`RegressorTags` or None + The regressor tags. + + array_api_support : bool, default=False + Whether the estimator supports Array API compatible inputs. + + no_validation : bool, default=False + Whether the estimator skips input-validation. This is only meant for + stateless and dummy transformers! + + non_deterministic : bool, default=False + Whether the estimator is not deterministic given a fixed ``random_state``. + + requires_fit : bool, default=True + Whether the estimator requires to be fitted before calling one of + `transform`, `predict`, `predict_proba`, or `decision_function`. + + _skip_test : bool, default=False + Whether to skip common tests entirely. Don't use this unless + you have a *very good* reason. + + input_tags : :class:`InputTags` + The input data(X) tags. + """ + + estimator_type: str | None + target_tags: TargetTags + transformer_tags: TransformerTags | None = None + classifier_tags: ClassifierTags | None = None + regressor_tags: RegressorTags | None = None + array_api_support: bool = False + no_validation: bool = False + non_deterministic: bool = False + requires_fit: bool = True + _skip_test: bool = False + input_tags: InputTags = field(default_factory=InputTags) + + def _patched_more_tags(estimator, expected_failed_checks): + import copy + + from sklearn.utils._tags import _safe_tags + + original_tags = copy.deepcopy(_safe_tags(estimator)) + + def patched_more_tags(self): + original_tags.update({"_xfail_checks": expected_failed_checks}) + return original_tags + + estimator.__class__._more_tags = patched_more_tags + return estimator + + def check_estimator( + estimator=None, + generate_only=False, + *, + legacy: bool = True, + expected_failed_checks: dict[str, str] | None = None, + on_skip: Literal["warn"] | None = "warn", + on_fail: Literal["raise", "warn"] | None = "raise", + callback: Callable | None = None, + ): + # legacy, on_skip, on_fail, and callback are not supported and ignored + from sklearn.utils.estimator_checks import check_estimator + + return check_estimator( + _patched_more_tags(estimator, expected_failed_checks), + generate_only=generate_only, + ) + + def parametrize_with_checks( + estimators, + *, + legacy: bool = True, + expected_failed_checks: Callable | None = None, + ): + # legacy is not supported and ignored + from sklearn.utils.estimator_checks import parametrize_with_checks + + estimators = [ + _patched_more_tags(estimator, expected_failed_checks(estimator)) + for estimator in estimators + ] + + return parametrize_with_checks(estimators) + +else: + # test_common + # tags infrastructure + from sklearn.utils import ( + ClassifierTags, + InputTags, + RegressorTags, + Tags, + TargetTags, 
+ TransformerTags, + ) + from sklearn.utils._test_common.instance_generator import ( + _construct_instances, # noqa: F401 + ) + from sklearn.utils.estimator_checks import ( + check_estimator, # noqa: F401 + parametrize_with_checks, # noqa: F401 + ) + from sklearn.utils.multiclass import type_of_target # noqa: F401 + + # validation + from sklearn.utils.validation import ( + _check_feature_names, # noqa: F401 + _check_n_features, # noqa: F401 + validate_data, # noqa: F401 + ) From 2dc7ebc177fc7604c3475b3ecc14298c9fef4d59 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:26:32 +0100 Subject: [PATCH 02/10] compat python 3.7 --- python-package/lightgbm/sklearn_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/sklearn_compat.py b/python-package/lightgbm/sklearn_compat.py index 9ce6792203ce..e79ce2511900 100644 --- a/python-package/lightgbm/sklearn_compat.py +++ b/python-package/lightgbm/sklearn_compat.py @@ -325,7 +325,7 @@ def _construct_instances(Estimator): yield _construct_instance(Estimator) # validation - def validate_data(_estimator, /, **kwargs): + def validate_data(_estimator, **kwargs): if "ensure_all_finite" in kwargs: force_all_finite = kwargs.pop("ensure_all_finite") else: From 40952803b7d9e51ba7315eed085270f71b3becab Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:34:24 +0100 Subject: [PATCH 03/10] fixing lint --- python-package/lightgbm/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 9a3f3a248b6a..c37c0207110a 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -64,7 +64,7 @@ class _LGBMRegressorBase: # type: ignore # additional scikit-learn imports only for type hints if TYPE_CHECKING: - from .sklearn_compat.utils import Tags as _sklearn_Tags + from .sklearn_compat.utils import Tags as _sklearn_Tags # noqa: F401 """pandas""" try: From 2585bdd623d15ff42233891b5b28ea72da8f706f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:34:40 +0100 Subject: [PATCH 04/10] fixing lint --- python-package/pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 19866e01202b..76e28bdf61df 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -185,6 +185,13 @@ select = [ # (pylint) Using the global statement is discouraged "PLW0603" ] +"python-package/lightgbm/sklearn_compat.py" = [ + # file is vendored from sklearn-compat + "D401", + "D400", + "D103", + "D205", +] "tests/*" = [ # (flake8-bugbear) Found useless expression "B018", From 3c2f26f6af679a06cf6f99508ead452aa6b8ceda Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:40:15 +0100 Subject: [PATCH 05/10] debug --- python-package/lightgbm/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index c37c0207110a..d0420ed9a543 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -33,6 +33,7 @@ _LGBMComputeSampleWeight = compute_sample_weight _LGBMValidateData = validate_data except ImportError: + raise SKLEARN_INSTALLED = False class _LGBMModelBase: # type: ignore From 37f209ee6455f2bff9cc2ca558747b5c4532e1e5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:47:42 +0100 Subject: [PATCH 06/10] use vendoring import 
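The first commit's import treated the vendored file as if it kept the upstream
package layout (`from .sklearn_compat.utils.validation import validate_data`).
`sklearn_compat.py` is a single flat module, not a package, so that dotted path
fails with `ModuleNotFoundError` at import time. Rename the file to
`_sklearn_compat.py`, with the leading underscore marking it as private, and
import the symbols directly from the flat module. A minimal sketch of the
resulting pattern in compat.py, with the fallback branch abbreviated (the full
version is in the diff below):

    # sketch: flat-module import of the vendored shim
    try:
        from ._sklearn_compat import validate_data  # vendored helper
        SKLEARN_INSTALLED = True
    except ImportError:
        SKLEARN_INSTALLED = False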
--- .../lightgbm/{sklearn_compat.py => _sklearn_compat.py} | 1 + python-package/lightgbm/compat.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) rename python-package/lightgbm/{sklearn_compat.py => _sklearn_compat.py} (99%) diff --git a/python-package/lightgbm/sklearn_compat.py b/python-package/lightgbm/_sklearn_compat.py similarity index 99% rename from python-package/lightgbm/sklearn_compat.py rename to python-package/lightgbm/_sklearn_compat.py index e79ce2511900..c1414f1daf5d 100644 --- a/python-package/lightgbm/sklearn_compat.py +++ b/python-package/lightgbm/_sklearn_compat.py @@ -1,3 +1,4 @@ +# coding: utf-8 """Ease developer experience to support multiple versions of scikit-learn. This file is intended to be vendored in your project if you do not want to depend on diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index d0420ed9a543..45650a93f1a5 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -16,7 +16,7 @@ from sklearn.exceptions import NotFittedError from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold from sklearn.utils.validation import _check_sample_weight - from .sklearn_compat.utils.validation import validate_data + from ._sklearn_compat import validate_data SKLEARN_INSTALLED = True _LGBMBaseCrossValidator = BaseCrossValidator @@ -65,7 +65,7 @@ class _LGBMRegressorBase: # type: ignore # additional scikit-learn imports only for type hints if TYPE_CHECKING: - from .sklearn_compat.utils import Tags as _sklearn_Tags # noqa: F401 + from ._sklearn_compat import Tags as _sklearn_Tags # noqa: F401 """pandas""" try: From 2a658a8f34e725fa81402420f516f995976842f7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:48:11 +0100 Subject: [PATCH 07/10] remove debug --- python-package/lightgbm/compat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 45650a93f1a5..377d1fae47c5 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -33,7 +33,6 @@ _LGBMComputeSampleWeight = compute_sample_weight _LGBMValidateData = validate_data except ImportError: - raise SKLEARN_INSTALLED = False class _LGBMModelBase: # type: ignore From 806b9fd22d90b0a73ebd83d9b211083e90213a1f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 22:55:51 +0100 Subject: [PATCH 08/10] fix --- python-package/lightgbm/sklearn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 108ef1e14498..bdb9224a3bdf 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -919,8 +919,8 @@ def fit( if not isinstance(X, (pd_DataFrame, dt_DataTable)): _X, _y = _LGBMValidateData( self, - X, - y, + X=X, + y=y, reset=True, # allow any input type (this validation is done further down, in lgb.Dataset()) accept_sparse=True, @@ -1078,7 +1078,7 @@ def predict( if not isinstance(X, (pd_DataFrame, dt_DataTable)): X = _LGBMValidateData( self, - X, + X=X, # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y() # # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'. 
From 879aea68b0e9b557d7d817383d3507e44f6442a3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 23:05:04 +0100 Subject: [PATCH 09/10] debug --- python-package/lightgbm/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 377d1fae47c5..45650a93f1a5 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -33,6 +33,7 @@ _LGBMComputeSampleWeight = compute_sample_weight _LGBMValidateData = validate_data except ImportError: + raise SKLEARN_INSTALLED = False class _LGBMModelBase: # type: ignore From 4e4ca84c64c7a7b53ea632fc033b5cba16ed9eb2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Dec 2024 23:15:22 +0100 Subject: [PATCH 10/10] remove type annotation --- python-package/lightgbm/_sklearn_compat.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/_sklearn_compat.py b/python-package/lightgbm/_sklearn_compat.py index c1414f1daf5d..1a6da7982102 100644 --- a/python-package/lightgbm/_sklearn_compat.py +++ b/python-package/lightgbm/_sklearn_compat.py @@ -15,7 +15,6 @@ import platform import sys from dataclasses import dataclass, field -from typing import Callable, Literal import sklearn from sklearn.utils._param_validation import validate_parameter_constraints @@ -586,11 +585,11 @@ def check_estimator( estimator=None, generate_only=False, *, - legacy: bool = True, - expected_failed_checks: dict[str, str] | None = None, - on_skip: Literal["warn"] | None = "warn", - on_fail: Literal["raise", "warn"] | None = "raise", - callback: Callable | None = None, + legacy=True, + expected_failed_checks=None, + on_skip="warn", + on_fail="raise", + callback=None, ): # legacy, on_skip, on_fail, and callback are not supported and ignored from sklearn.utils.estimator_checks import check_estimator @@ -603,8 +602,8 @@ def check_estimator( def parametrize_with_checks( estimators, *, - legacy: bool = True, - expected_failed_checks: Callable | None = None, + legacy=True, + expected_failed_checks=None, ): # legacy is not supported and ignored from sklearn.utils.estimator_checks import parametrize_with_checks