diff --git a/.ci/install-old-r-packages.R b/.ci/install-old-r-packages.R
new file mode 100644
index 000000000000..e402c4d5ca12
--- /dev/null
+++ b/.ci/install-old-r-packages.R
@@ -0,0 +1,82 @@
+# [description]
+#
+#     Installs a pinned set of packages that worked together
+#     as of the last R 3.6 release.
+#
+
+# Install archived source tarballs from CRAN's 'src/contrib/Archive' directory.
+# 'packages' is a character vector of paths relative to that directory.
+# Dependencies are not resolved, and the library comes from the R_LIBS env variable.
+.install_packages <- function(packages) {
+    install.packages( # nolint: undesirable_function
+        pkgs = paste( # nolint: paste
+            "https://cran.r-project.org/src/contrib/Archive"
+            , packages
+            , sep = "/"
+        )
+        , dependencies = FALSE
+        , lib = Sys.getenv("R_LIBS")
+        , repos = NULL
+    )
+}
+
+# when confronted with a bunch of URLs like this, install.packages() sometimes
+# struggles to determine install order... so install packages in batches here,
+# starting from the root of the dependency graph and working up
+
+# there was only a single release of {praise}, so there is no contrib/Archive URL for it
+install.packages( # nolint: undesirable_function
+    pkgs = "https://cran.r-project.org/src/contrib/praise_1.0.0.tar.gz"
+    , dependencies = FALSE
+    , lib = Sys.getenv("R_LIBS")
+    , repos = NULL
+)
+
+.install_packages(c(
+    "brio/brio_1.1.4.tar.gz" # nolint: non_portable_path
+    , "cli/cli_3.6.2.tar.gz" # nolint: non_portable_path
+    , "crayon/crayon_1.5.2.tar.gz" # nolint: non_portable_path
+    , "digest/digest_0.6.36.tar.gz" # nolint: non_portable_path
+    , "evaluate/evaluate_0.23.tar.gz" # nolint: non_portable_path
+    , "fansi/fansi_1.0.5.tar.gz" # nolint: non_portable_path
+    , "fs/fs_1.6.4.tar.gz" # nolint: non_portable_path
+    , "glue/glue_1.7.0.tar.gz" # nolint: non_portable_path
+    , "jsonlite/jsonlite_1.8.8.tar.gz" # nolint: non_portable_path
+    , "lattice/lattice_0.20-41.tar.gz" # nolint: non_portable_path
+    , "magrittr/magrittr_2.0.2.tar.gz" # nolint: non_portable_path
+    , "pkgconfig/pkgconfig_2.0.2.tar.gz" # nolint: non_portable_path
+    , "ps/ps_1.8.0.tar.gz" # nolint: non_portable_path
+    , "R6/R6_2.5.0.tar.gz" # nolint: non_portable_path
+    , "rlang/rlang_1.1.3.tar.gz" # nolint: non_portable_path
+    , "rprojroot/rprojroot_2.0.3.tar.gz" # nolint: non_portable_path
+    , "utf8/utf8_1.2.3.tar.gz" # nolint: non_portable_path
+    , "withr/withr_3.0.1.tar.gz" # nolint: non_portable_path
+))
+
+.install_packages(c(
+    "desc/desc_1.4.2.tar.gz" # nolint: non_portable_path
+    , "diffobj/diffobj_0.3.4.tar.gz" # nolint: non_portable_path
+    , "lifecycle/lifecycle_1.0.3.tar.gz" # nolint: non_portable_path
+    , "processx/processx_3.8.3.tar.gz" # nolint: non_portable_path
+))
+
+.install_packages(c(
+    "callr/callr_3.7.5.tar.gz" # nolint: non_portable_path
+    , "vctrs/vctrs_0.6.4.tar.gz" # nolint: non_portable_path
+))
+
+.install_packages(c(
+    "pillar/pillar_1.8.1.tar.gz" # nolint: non_portable_path
+    , "tibble/tibble_3.2.0.tar.gz" # nolint: non_portable_path
+))
+
+.install_packages(c(
+    "pkgbuild/pkgbuild_1.4.4.tar.gz" # nolint: non_portable_path
+    , "rematch2/rematch2_2.1.1.tar.gz" # nolint: non_portable_path
+    , "waldo/waldo_0.5.3.tar.gz" # nolint: non_portable_path
+))
+
+.install_packages(c(
+    "pkgload/pkgload_1.3.4.tar.gz" # nolint: non_portable_path
+    , "testthat/testthat_3.2.1.tar.gz" # nolint: non_portable_path
+))
diff --git a/.ci/test-r-package.sh b/.ci/test-r-package.sh
index ae205213d787..a076fab0186c 100755
--- a/.ci/test-r-package.sh
+++ b/.ci/test-r-package.sh
@@ -108,10 +108,10 @@ if [[ $OS_NAME == "macos" ]]; then
     export R_TIDYCMD=/usr/local/bin/tidy
 fi
 
-# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
+# fix for issue where CRAN was not returning {evaluate}, {lattice}, or {waldo} when using R 3.6
 # "Warning: dependency ‘lattice’ is not available"
 if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
-    Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
+    Rscript --vanilla ./.ci/install-old-r-packages.R
 else
     # {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
     # This should be unnecessary on R >=4.4.0
diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml
index 1758583ad8e4..8811f53b61c0 100644
--- a/.github/workflows/r_package.yml
+++ b/.github/workflows/r_package.yml
@@ -274,6 +274,7 @@ jobs:
           - clang19
           - gcc14
           - intel
+          - rchk
     runs-on: ubuntu-latest
     container: ghcr.io/r-hub/containers/${{ matrix.image }}:latest
     steps:
@@ -311,8 +312,32 @@ jobs:
       - name: Install packages and run tests
         shell: bash
         run: |
-          Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())"
+          Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())"
           sh build-cran-package.sh
+
+          # 'rchk' isn't run through 'R CMD check', use the approach documented at
+          # https://r-hub.github.io/containers/local.html
+          if [[ "${{ matrix.image }}" =~ "rchk" ]]; then
+            r-check "$(pwd)" 2>&1 \
+            | tee ./rchk-logs.txt
+
+            # the '-v' exceptions below are from R/rchk itself and not LightGBM:
+            # https://github.com/kalibera/rchk/issues/22#issuecomment-656036156
+            if grep -E '\[PB\]|ERROR' ./rchk-logs.txt \
+              | grep -v 'too many states' \
+              > /dev/null; \
+            then
+              echo "rchk found issues"
+              exit 1
+            else
+              echo "rchk did not find any issues"
+              exit 0
+            fi
+          fi
+
+          # 'testthat' is not needed by 'rchk', so avoid installing it until here
+          Rscript -e "install.packages('testthat', repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())"
+
           if [[ "${{ matrix.image }}" =~ "clang" ]]; then
             # allowing the following NOTEs (produced by default in the clang images):
             #
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 96dee6522572..0b9444b0ecbf 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -14,6 +14,14 @@
     from sklearn.utils.multiclass import check_classification_targets
     from sklearn.utils.validation import assert_all_finite, check_array, check_X_y
 
+    # sklearn.utils Tags types can be imported unconditionally once
+    # lightgbm's minimum scikit-learn version is 1.6 or higher
+    try:
+        from sklearn.utils import ClassifierTags as _sklearn_ClassifierTags
+        from sklearn.utils import RegressorTags as _sklearn_RegressorTags
+    except ImportError:
+        _sklearn_ClassifierTags = None
+        _sklearn_RegressorTags = None
     try:
         from sklearn.exceptions import NotFittedError
         from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
@@ -140,6 +148,8 @@ class _LGBMRegressorBase: # type: ignore
     _LGBMCheckClassificationTargets = None
     _LGBMComputeSampleWeight = None
     _LGBMValidateData = None
+    _sklearn_ClassifierTags = None
+    _sklearn_RegressorTags = None
     _sklearn_version = None
 
 # additional scikit-learn imports only for type hints
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index c4d1200e99e4..614e3c3cbe7f 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -40,6 +40,8 @@
     _LGBMModelBase,
     _LGBMRegressorBase,
     _LGBMValidateData,
+    _sklearn_ClassifierTags,
+    _sklearn_RegressorTags,
     _sklearn_version,
     dt_DataTable,
     pd_DataFrame,
@@ -703,7 +705,6 @@ def _update_sklearn_tags_from_dict(
         tags.input_tags.allow_nan = tags_dict["allow_nan"]
         tags.input_tags.sparse = "sparse" in tags_dict["X_types"]
         tags.target_tags.one_d_labels = "1dlabels" in tags_dict["X_types"]
-        tags._xfail_checks = tags_dict["_xfail_checks"]
         return tags
 
     def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
@@ -1291,7 +1292,10 @@ def _more_tags(self) -> Dict[str, Any]:
         return tags
 
     def __sklearn_tags__(self) -> "_sklearn_Tags":
-        return LGBMModel.__sklearn_tags__(self)
+        tags = LGBMModel.__sklearn_tags__(self)
+        tags.estimator_type = "regressor"
+        tags.regressor_tags = _sklearn_RegressorTags(multi_label=False)
+        return tags
 
     def fit(  # type: ignore[override]
         self,
@@ -1350,7 +1354,10 @@ def _more_tags(self) -> Dict[str, Any]:
         return tags
 
     def __sklearn_tags__(self) -> "_sklearn_Tags":
-        return LGBMModel.__sklearn_tags__(self)
+        tags = LGBMModel.__sklearn_tags__(self)
+        tags.estimator_type = "classifier"
+        tags.classifier_tags = _sklearn_ClassifierTags(multi_class=True, multi_label=False)
+        return tags
 
     def fit(  # type: ignore[override]
         self,
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 6eca66ff20d3..d187e9df5a9f 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -17,11 +17,18 @@
 from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
 from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain
-from sklearn.utils.estimator_checks import parametrize_with_checks
+from sklearn.utils.estimator_checks import parametrize_with_checks as sklearn_parametrize_with_checks
 from sklearn.utils.validation import check_is_fitted
 
 import lightgbm as lgb
-from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
+from lightgbm.compat import (
+    DATATABLE_INSTALLED,
+    PANDAS_INSTALLED,
+    _sklearn_version,
+    dt_DataTable,
+    pd_DataFrame,
+    pd_Series,
+)
 
 from .utils import (
     assert_silent,
@@ -35,6 +42,9 @@
     softmax,
 )
 
+SKLEARN_MAJOR, SKLEARN_MINOR, *_ = _sklearn_version.split(".")
+SKLEARN_VERSION_GTE_1_6 = (int(SKLEARN_MAJOR), int(SKLEARN_MINOR)) >= (1, 6)
+
 decreasing_generator = itertools.count(0, -1)
 estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker)
 task_to_model_factory = {
@@ -1432,7 +1442,28 @@ def test_getting_feature_names_in_pd_input(estimator_class):
     np.testing.assert_array_equal(model.feature_names_in_, X.columns)
 
 
-@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
+# Starting with scikit-learn 1.6 (https://github.com/scikit-learn/scikit-learn/pull/30149),
+# the only API for marking estimator tests as expected to fail is to pass a keyword argument
+# to parametrize_with_checks(). That function didn't accept additional arguments in earlier
+# versions.
+#
+# This block defines a patched version of parametrize_with_checks() so lightgbm's tests
+# can be compatible with scikit-learn <1.6 and >=1.6.
+#
+# This should be removed once minimum supported scikit-learn version is at least 1.6.
+if SKLEARN_VERSION_GTE_1_6:
+    parametrize_with_checks = sklearn_parametrize_with_checks
+else:
+
+    def parametrize_with_checks(estimator, *args, **kwargs):
+        return sklearn_parametrize_with_checks(estimator)
+
+
+def _get_expected_failed_tests(estimator):
+    return estimator._more_tags()["_xfail_checks"]
+
+
+@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()], expected_failed_checks=_get_expected_failed_tests)
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)
     check(estimator)
@@ -1457,7 +1488,6 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
         assert sklearn_tags.input_tags.allow_nan is True
         assert sklearn_tags.input_tags.sparse is True
         assert sklearn_tags.target_tags.one_d_labels is True
-        assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]
 
 
 @pytest.mark.parametrize("task", all_tasks)