Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adapt to scikit-learn 1.6 estimator tag changes #11021

Merged
merged 15 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,13 @@ credentials.csv
.bloop

# python tests
*.bin
demo/**/*.txt
*.dmatrix
.hypothesis
__MACOSX/
model*.json
/tests/python/models/models/
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticed some model files left behind from running all the Python tests locally while developing this. These .gitignore rules prevent checking them into source control.


# R tests
*.htm
Expand Down
40 changes: 36 additions & 4 deletions python-package/xgboost/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:

# sklearn
try:
from sklearn import __version__ as _sklearn_version
from sklearn.base import BaseEstimator as XGBModelBase
from sklearn.base import ClassifierMixin as XGBClassifierBase
from sklearn.base import RegressorMixin as XGBRegressorBase
Expand All @@ -55,20 +56,51 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
from sklearn.cross_validation import KFold as XGBKFold
from sklearn.cross_validation import StratifiedKFold as XGBStratifiedKFold

# sklearn.utils Tags types can be imported unconditionally once
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can do that once the next sklearn is published.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should.

That'd effectively raise xgboost's requirement all the way to scikit-learn>=1.6.

Because it would result in compat.SKLEARN_INSTALLED being False for scikit-learn < 1.6:

except ImportError:
SKLEARN_INSTALLED = False

Which would make all the estimators unusable on those versions.

if not SKLEARN_INSTALLED:
raise ImportError(
"sklearn needs to be installed in order to use this module"
)

# xgboost's minimum scikit-learn version is 1.6 or higher
try:
from sklearn.utils import ClassifierTags as _sklearn_ClassifierTags
from sklearn.utils import RegressorTags as _sklearn_RegressorTags
from sklearn.utils import Tags as _sklearn_Tags
except ImportError:
_sklearn_ClassifierTags = object
_sklearn_RegressorTags = object
_sklearn_Tags = object

SKLEARN_INSTALLED = True

except ImportError:
SKLEARN_INSTALLED = False

# used for compatibility without sklearn
XGBModelBase = object
XGBClassifierBase = object
XGBRegressorBase = object
LabelEncoder = object
class XGBModelBase: # type: ignore[no-redef]
"""Dummy class for sklearn.base.BaseEstimator."""

pass

class XGBClassifierBase: # type: ignore[no-redef]
"""Dummy class for sklearn.base.ClassifierMixin."""

pass

class XGBRegressorBase: # type: ignore[no-redef]
"""Dummy class for sklearn.base.RegressorMixin."""

pass

class LabelEncoder: # type: ignore[no-redef]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can remove the label encoder for now. It's not used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh great! I just removed that in a511848

Noticed that KFold was also unused, so I removed that as well.

"""Dummy class for sklearn.preprocessing.LabelEncoder."""

pass
Copy link
Contributor Author

@jameslamb jameslamb Nov 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With all of these placeholder classes set to object, using the re-arranged inheritance order (https://github.com/dmlc/xgboost/pull/11021/files#r1857885039) results in errors like this when importing xgboost when scikit-learn is not installed:

TypeError: Cannot create a consistent method resolution order (MRO) for bases object, XGBModel

(build link)

Having each one be a different class resolves that. I forgot until I saw this test failure that we faced a similar thing a few years ago in LightGBM: microsoft/LightGBM#3192

Just copying @StrikerRUS 's solution from there here... it's worked well for lightgbm.


XGBKFold = None
XGBStratifiedKFold = None

_sklearn_ClassifierTags = object
_sklearn_RegressorTags = object
_sklearn_Tags = object
_sklearn_version = object


_logger = logging.getLogger(__name__)

Expand Down
86 changes: 80 additions & 6 deletions python-package/xgboost/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
XGBClassifierBase,
XGBModelBase,
XGBRegressorBase,
_sklearn_ClassifierTags,
_sklearn_RegressorTags,
_sklearn_Tags,
_sklearn_version,
import_cupy,
)
from .config import config_context
Expand Down Expand Up @@ -805,6 +809,41 @@ def _more_tags(self) -> Dict[str, bool]:
tags["non_deterministic"] = True
return tags

@staticmethod
def _update_sklearn_tags_from_dict(
*,
tags: _sklearn_Tags,
tags_dict: Dict[str, bool],
) -> _sklearn_Tags:
"""Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes.

``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags.
ref: https://github.com/scikit-learn/scikit-learn/pull/29677

This method handles updating that instance based on the values in ``self._more_tags()``.
"""
tags.non_deterministic = tags_dict.get("non_deterministic", False)
tags.no_validation = tags_dict["no_validation"]
tags.input_tags.allow_nan = tags_dict["allow_nan"]
return tags

def __sklearn_tags__(self) -> _sklearn_Tags:
# XGBModelBase.__sklearn_tags__() cannot be called unconditionally,
# because that method isn't defined for scikit-learn<1.6
if not hasattr(XGBModelBase, "__sklearn_tags__"):
err_msg = (
"__sklearn_tags__() should not be called when using scikit-learn<1.6. "
f"Detected version: {_sklearn_version}"
)
raise AttributeError(err_msg)

# take whatever tags are provided by BaseEstimator, then modify
# them with XGBoost-specific values
return self._update_sklearn_tags_from_dict(
tags=XGBModelBase.__sklearn_tags__(self), # pylint: disable=no-member
tags_dict=self._more_tags(),
)

def __sklearn_is_fitted__(self) -> bool:
return hasattr(self, "_Booster")

Expand Down Expand Up @@ -898,13 +937,30 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
"""Get parameters."""
# Based on: https://stackoverflow.com/questions/59248211
# The basic flow in `get_params` is:
# 0. Return parameters in subclass first, by using inspect.
# 1. Return parameters in `XGBModel` (the base class).
# 0. Return parameters in subclass (self.__class__) first, by using inspect.
# 1. Return parameters in all parent classes (especially `XGBModel`).
# 2. Return whatever in `**kwargs`.
# 3. Merge them.
#
# This needs to accommodate being called recursively in the following
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please help add a test for this? The hierarchy and the Python introspection are getting a bit confusing now. ;-(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! I just added one in a511848, let me know if there are other conditions you'd like to see tested.

Between that and the existing test:

def test_parameters_access():

I think this behavior should be well-covered.

# inheritance graphs (and similar for classification and ranking):
#
# XGBRFRegressor -> XGBRegressor -> XGBModel -> BaseEstimator
# XGBRegressor -> XGBModel -> BaseEstimator
# XGBModel -> BaseEstimator
#
params = super().get_params(deep)
cp = copy.copy(self)
cp.__class__ = cp.__class__.__bases__[0]
# if the immediate parent is a mixin, skip it (mixins don't define get_params())
Copy link
Member

@trivialfis trivialfis Dec 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it's more general to check for the get_params attribute instead of checking hardcoded mixins? The current check seems to defeat the purpose of having a polymorphic structure (inheritance).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that's a good idea! Just pushed 18c602f implementing that.

if cp.__class__.__bases__[0] in (
XGBClassifierBase,
XGBRankerMixIn,
XGBRegressorBase,
):
cp.__class__ = cp.__class__.__bases__[1]
# otherwise, run get_params() from the immediate parent class
else:
cp.__class__ = cp.__class__.__bases__[0]
params.update(cp.__class__.get_params(cp, deep))
# if kwargs is a dict, update params accordingly
if hasattr(self, "kwargs") and isinstance(self.kwargs, dict):
Expand Down Expand Up @@ -1481,7 +1537,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
Number of boosting rounds.
""",
)
class XGBClassifier(XGBModel, XGBClassifierBase):
class XGBClassifier(XGBClassifierBase, XGBModel):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As of scikit-learn/scikit-learn#30234 (which will be in scikit-learn 1.6), the estimator checks raise an error like the following:

XGBRegressor is inheriting from mixins in the wrong order. In general, in mixin inheritance, more specialized mixins must come before more general ones. This means, for instance, BaseEstimator should be on the right side of most other mixins. You need to change the order...

That check is new, but it enforced behavior that's been documented in scikit-learn's estimator development docs for a long time. See the "BaseEstimator and mixins" section in https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.

It is particularly important to notice that mixins should be “on the left” while the BaseEstimator should be “on the right” in the inheritance list for proper MRO.

That new check error led to these inheritance-order changes, which led to the XGBModel.get_params() changes.

# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
@_deprecate_positional_args
def __init__(
Expand All @@ -1497,6 +1553,15 @@ def _more_tags(self) -> Dict[str, bool]:
tags["multilabel"] = True
return tags

def __sklearn_tags__(self) -> _sklearn_Tags:
tags = XGBModel.__sklearn_tags__(self)
tags.estimator_type = "classifier"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this if we inherit the classifier mixin?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah you are totally right, I don't think we do:

https://github.com/scikit-learn/scikit-learn/blob/e6037ba412ed889a888a60bd6c022990f2669507/sklearn/base.py#L536-L544

Removed this, the corresponding LGBRegressor code, and the imports of ClassifierTags / RegressorTags in a511848

tags_dict = self._more_tags()
tags.classifier_tags = _sklearn_ClassifierTags(
multi_label=tags_dict["multilabel"]
)
return tags

@_deprecate_positional_args
def fit(
self,
Expand Down Expand Up @@ -1769,7 +1834,7 @@ def fit(
"Implementation of the scikit-learn API for XGBoost regression.",
["estimators", "model", "objective"],
)
class XGBRegressor(XGBModel, XGBRegressorBase):
class XGBRegressor(XGBRegressorBase, XGBModel):
# pylint: disable=missing-docstring
@_deprecate_positional_args
def __init__(
Expand All @@ -1783,6 +1848,15 @@ def _more_tags(self) -> Dict[str, bool]:
tags["multioutput_only"] = False
return tags

def __sklearn_tags__(self) -> _sklearn_Tags:
tags = XGBModel.__sklearn_tags__(self)
tags.estimator_type = "regressor"
tags_dict = self._more_tags()
tags.regressor_tags = _sklearn_RegressorTags()
tags.target_tags.multi_output = tags_dict["multioutput"]
tags.target_tags.single_output = not tags_dict["multioutput_only"]
return tags


@xgboost_model_doc(
"scikit-learn API for XGBoost random forest regression.",
Expand Down Expand Up @@ -1910,7 +1984,7 @@ def _get_qid(
`qid` can be a special column of input `X` instead of a separated parameter, see
:py:meth:`fit` for more info.""",
)
class XGBRanker(XGBModel, XGBRankerMixIn):
class XGBRanker(XGBRankerMixIn, XGBModel):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
@_deprecate_positional_args
def __init__(self, *, objective: str = "rank:ndcg", **kwargs: Any):
Expand Down
55 changes: 54 additions & 1 deletion tests/python/test_with_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import pickle
import random
import re
import tempfile
import warnings
from typing import Callable, Optional
Expand Down Expand Up @@ -1517,7 +1518,7 @@ def test_tags() -> None:
assert tags["multioutput"] is True
assert tags["multioutput_only"] is False

for clf in [xgb.XGBClassifier()]:
for clf in [xgb.XGBClassifier(), xgb.XGBRFClassifier()]:
tags = clf._more_tags()
assert "multioutput" not in tags
assert tags["multilabel"] is True
Expand All @@ -1526,6 +1527,58 @@ def test_tags() -> None:
assert "multioutput" not in tags


# the try-excepts in this test should be removed once xgboost's
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can use pytest.mark.skipif to skip tests. Seems simpler.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking that it's useful to check that the exact, expected AttributeError is raised...there are so many layers of inheritance involved here, it wouldn't be hard to implement this in a way that accidentally raises some totally unrelated error accessing a non-existent property or something.

If you read that and still think skipif() would be preferable, I'll happy change to that and remove the try-except, just wanted to explain my thinking.

# minimum supported scikit-learn version is at least 1.6
def test_sklearn_tags():

def _assert_has_xgbmodel_tags(tags):
# values set by XGBModel.__sklearn_tags__()
assert tags.non_deterministic is False
assert tags.no_validation is True
assert tags.input_tags.allow_nan is True

for reg in [xgb.XGBRegressor(), xgb.XGBRFRegressor()]:
try:
# if no AttributeError was thrown, we must be using scikit-learn>=1.6,
# and so the actual effects of __sklearn_tags__() should be tested
tags = reg.__sklearn_tags__()
_assert_has_xgbmodel_tags(tags)
# regressor-specific values
assert tags.estimator_type == "regressor"
assert tags.regressor_tags is not None
assert tags.classifier_tags is None
assert tags.target_tags.multi_output is True
assert tags.target_tags.single_output is True
except AttributeError as err:
# only the exact error we expected to be raised should be raised
assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err)))

for clf in [xgb.XGBClassifier(), xgb.XGBRFClassifier()]:
try:
# if no AttributeError was thrown, we must be using scikit-learn>=1.6,
# and so the actual effects of __sklearn_tags__() should be tested
tags = clf.__sklearn_tags__()
_assert_has_xgbmodel_tags(tags)
# classifier-specific values
assert tags.estimator_type == "classifier"
assert tags.regressor_tags is None
assert tags.classifier_tags is not None
assert tags.classifier_tags.multi_label is True
except AttributeError as err:
# only the exact error we expected to be raised should be raised
assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err)))

for rnk in [xgb.XGBRanker(),]:
try:
# if no AttributeError was thrown, we must be using scikit-learn>=1.6,
# and so the actual effects of __sklearn_tags__() should be tested
tags = rnk.__sklearn_tags__()
_assert_has_xgbmodel_tags(tags)
except AttributeError as err:
# only the exact error we expected to be raised should be raised
assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err)))


def test_doc_link() -> None:
for est in [
xgb.XGBRegressor(),
Expand Down
Loading