From 4bf9f9545589aeb808cf1b7e8ca38ac237a9982e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 9 Sep 2021 12:45:50 +0100 Subject: [PATCH 1/7] [ci] skip Dask tests on QEMU builds (#4600) --- tests/python_package_test/test_dask.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 2f84933cfd37..84047abf1f61 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -7,6 +7,7 @@ import socket from itertools import groupby from os import getenv +from platform import machine from sys import platform import pytest @@ -15,6 +16,8 @@ if not platform.startswith('linux'): pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) +if machine() != 'x86_64': + pytest.skip('lightgbm.dask tests are currently skipped on some architectures like arm64', allow_module_level=True) if not lgb.compat.DASK_INSTALLED: pytest.skip('Dask is not installed', allow_module_level=True) From d411bced3a52493c141232683b03f3d6ad79e632 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 9 Sep 2021 16:09:58 +0100 Subject: [PATCH 2/7] prefer spaces to tabs in `CMakeLists.txt` (#4593) Co-authored-by: Nikita Titov --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fac21045e96..aa25d8c4237b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -405,9 +405,9 @@ if(USE_SWIG) COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com) else() add_custom_command(TARGET _lightgbm_swig POST_BUILD - COMMAND "${Java_JAVAC_EXECUTABLE}" -d . java/*.java - COMMAND cp "${PROJECT_SOURCE_DIR}/*.so" com/microsoft/ml/lightgbm/linux/x86_64 - COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com) + COMMAND "${Java_JAVAC_EXECUTABLE}" -d . java/*.java + COMMAND cp "${PROJECT_SOURCE_DIR}/*.so" com/microsoft/ml/lightgbm/linux/x86_64 + COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com) endif() endif(USE_SWIG) From 5857ef5e38269f5cde80de2afd9c9a3fff75e612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 9 Sep 2021 11:09:14 -0500 Subject: [PATCH 3/7] [tests][dask] Use workers hostname in tests (fixes #4594) (#4595) Co-authored-by: Nikita Titov --- tests/python_package_test/test_dask.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 84047abf1f61..755c2549fbe8 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -9,6 +9,7 @@ from os import getenv from platform import machine from sys import platform +from urllib.parse import urlparse import pytest @@ -87,6 +88,11 @@ def listen_port(): listen_port.port = 13000 +def _get_workers_hostname(cluster: LocalCluster) -> str: + one_worker_address = next(iter(cluster.scheduler_info['workers'])) + return urlparse(one_worker_address).hostname + + def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs): X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs) rnd = np.random.RandomState(42) @@ -485,8 +491,9 @@ def test_training_does_not_fail_on_port_conflicts(cluster): _, _, _, _, dX, dy, dw, _ = _create_data('binary-classification', output='array') lightgbm_default_port = 12400 + workers_hostname = _get_workers_hostname(cluster) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('127.0.0.1', lightgbm_default_port)) + s.bind((workers_hostname, lightgbm_default_port)) dask_classifier = lgb.DaskLGBMClassifier( client=client, time_out=5, @@ -1395,13 +1402,14 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c assert 'machines' not in params # model 2 - machines given + workers_hostname = _get_workers_hostname(cluster) n_workers = len(client.scheduler_info()['workers']) open_ports = lgb.dask._find_n_open_ports(n_workers) dask_model2 = dask_model_factory( n_estimators=5, num_leaves=5, machines=",".join([ - f"127.0.0.1:{port}" + f"{workers_hostname}:{port}" for port in open_ports ]), ) @@ -1442,12 +1450,13 @@ def test_machines_should_be_used_if_provided(task, cluster): n_workers = len(client.scheduler_info()['workers']) assert n_workers > 1 + workers_hostname = _get_workers_hostname(cluster) open_ports = lgb.dask._find_n_open_ports(n_workers) dask_model = dask_model_factory( n_estimators=5, num_leaves=5, machines=",".join([ - f"127.0.0.1:{port}" + f"{workers_hostname}:{port}" for port in open_ports ]), ) @@ -1457,7 +1466,7 @@ def test_machines_should_be_used_if_provided(task, cluster): error_msg = f"Binding port {open_ports[0]} failed" with pytest.raises(lgb.basic.LightGBMError, match=error_msg): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('127.0.0.1', open_ports[0])) + s.bind((workers_hostname, open_ports[0])) dask_model.fit(dX, dy, group=dg) # The above error leaves a worker waiting From fa1d06f136fde35bdb3a860c999d08da5517fa16 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 9 Sep 2021 17:10:03 +0100 Subject: [PATCH 4/7] [docs] add lightgbm_ray to docs (#4584) * [docs] add lightgbm_ray to docs * add docs link * Apply suggestions from code review Co-authored-by: Nikita Titov * move Ray * Update docs/Parallel-Learning-Guide.rst Co-authored-by: Nikita Titov Co-authored-by: Nikita Titov Co-authored-by: Nikita Titov --- README.md | 2 ++ docs/Parallel-Learning-Guide.rst | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed6346162469..af0c9b68ba91 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,8 @@ Kubeflow Fairing (LightGBM on Kubernetes): https://github.com/kubeflow/fairing Kubeflow Operator (LightGBM on Kubernetes): https://github.com/kubeflow/xgboost-operator +lightgbm_ray (LightGBM on Ray): https://github.com/ray-project/lightgbm_ray + ML.NET (.NET/C#-package): https://github.com/dotnet/machinelearning LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 45a89312b1c0..5c2021dbb4c8 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -435,7 +435,7 @@ MPI Version 3. Run following command on one machine (not need to run on all machines), need to change ``your_config_file`` to real config file. For Windows: - + .. code:: mpiexec.exe /machinefile mlist.txt lightgbm.exe config=your_config_file @@ -451,6 +451,17 @@ Example - `A simple distributed learning example`_ +Ray +^^^ + +`Ray`_ is a Python-based framework for distributed computing. The `lightgbm_ray`_ project, maintained within the official Ray GitHub organization, can be used to perform distributed LightGBM training using ``ray``. + +See `the lightgbm_ray documentation`_ for usage examples. + +.. note:: + + ``lightgbm_ray`` is not maintained by LightGBM's maintainers. Bug reports or feature requests should be directed to https://github.com/ray-project/lightgbm_ray/issues. + .. _Dask: https://docs.dask.org/en/latest/ .. _SynapseML: https://aka.ms/spark @@ -482,3 +493,9 @@ Example .. _here: https://www.youtube.com/watch?v=iqzXhp5TxUY .. _A simple distributed learning example: https://github.com/microsoft/lightgbm/tree/master/examples/parallel_learning + +.. _lightgbm_ray: https://github.com/ray-project/lightgbm_ray + +.. _Ray: https://ray.io/ + +.. _the lightgbm_ray documentation: https://docs.ray.io/en/latest/lightgbm-ray.html From c84900753c0332b3ba3e931fb2e4af54bccc67d7 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 10 Sep 2021 05:42:11 +0300 Subject: [PATCH 5/7] [ci] fix link to LightGBM public e-mail (#4603) --- .github/workflows/r_solaris.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r_solaris.yml b/.github/workflows/r_solaris.yml index ee1a581c23b5..662307fce35d 100644 --- a/.github/workflows/r_solaris.yml +++ b/.github/workflows/r_solaris.yml @@ -48,7 +48,7 @@ jobs: url=${line#*@} body="${body}**${platform}**: ${url}\r\n" done < "$GITHUB_WORKSPACE/rhub_logs.txt" || true - body="${body}Reports also have been sent to LightGBM public e-mail: http://www.yopmail.com/lightgbm_rhub_checks\r\n" + body="${body}Reports also have been sent to LightGBM public e-mail: https://yopmail.com?lightgbm_rhub_checks\r\n" body="${body}Status: ${{ job.status }}." $GITHUB_WORKSPACE/.ci/append_comment.sh \ "${{ github.event.client_payload.comment_number }}" \ From 79463dfb116c52f1b6b05dcd513b3415cc7d6e1d Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 10 Sep 2021 06:33:39 +0300 Subject: [PATCH 6/7] [python] [sklearn] respect `eval_at` aliases in keyword arguments (#4599) --- python-package/lightgbm/sklearn.py | 11 +++++++---- tests/python_package_test/test_sklearn.py | 13 +++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index c59582464f88..32cb983b6f27 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -26,7 +26,7 @@ def __init__(self, func): Parameters ---------- func : callable - Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group) + Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)`` and returns (grad, hess): y_true : array-like of shape = [n_samples] @@ -611,9 +611,12 @@ def fit(self, X, y, params.pop(alias, None) params['num_class'] = self._n_classes if hasattr(self, '_eval_at'): + eval_at = self._eval_at for alias in _ConfigAliases.get('eval_at'): - params.pop(alias, None) - params['eval_at'] = self._eval_at + if alias in params: + _log_warning(f"Found '{alias}' in params. Will use it instead of 'eval_at' argument") + eval_at = params.pop(alias) + params['eval_at'] = eval_at params['objective'] = self._objective if self._fobj: params['objective'] = 'None' # objective = nullptr for unknown objective @@ -752,7 +755,7 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=False, pred_contrib=False, **kwargs): """Docstring is set after definition, using a template.""" if self._n_features is None: - raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.") + raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") if not isinstance(X, (pd_DataFrame, dt_DataTable)): X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False) n_features = X.shape[1] diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index da5fc8ba30c2..6b1ac8a9f3d2 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -143,6 +143,19 @@ def test_xendcg(): assert gbm.best_score_['valid_0']['ndcg@3'] > 0.6253 +def test_eval_at_aliases(): + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) + X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test')) + q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) + q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query')) + for alias in ('eval_at', 'ndcg_eval_at', 'ndcg_at', 'map_eval_at', 'map_at'): + gbm = lgb.LGBMRanker(n_estimators=5, **{alias: [1, 2, 3, 9]}) + with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'eval_at' argument"): + gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], eval_group=[q_test]) + assert list(gbm.evals_result_['valid_0'].keys()) == ['ndcg@1', 'ndcg@2', 'ndcg@3', 'ndcg@9'] + + def test_regression_with_custom_objective(): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) From a08c37f647125ca3a86906340a161d434dbba98d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 10 Sep 2021 05:20:46 +0100 Subject: [PATCH 7/7] [R-package] avoid unnecessary computation and add tests for Dataset set_reference() method (#4587) * [R-package] avoid unnecessary computation in Dataset set_reference() method * re-arrange conditions * do more validation upfront and add tests * Update R-package/tests/testthat/test_dataset.R Co-authored-by: Nikita Titov * Update R-package/tests/testthat/test_dataset.R Co-authored-by: Nikita Titov Co-authored-by: Nikita Titov --- R-package/R/lgb.Dataset.R | 27 ++---- R-package/tests/testthat/test_dataset.R | 116 ++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 17 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 8badc3ee3e5c..429eb1f91275 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -663,34 +663,27 @@ Dataset <- R6::R6Class( # Set reference set_reference = function(reference) { - # Set known references - self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature) - self$set_colnames(colnames = reference$get_colnames()) - private$set_predictor(predictor = reference$.__enclos_env__$private$predictor) - - # Check for identical references + # setting reference to this same Dataset object doesn't require any changes if (identical(private$reference, reference)) { return(invisible(self)) } - # Check for empty data + # changing the reference removes the Dataset object on the C++ side, so it should only + # be done if you still have the raw_data available, so that the new Dataset can be reconstructed if (is.null(private$raw_data)) { - stop("set_reference: cannot set reference after freeing raw data, please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset") - } - # Check for non-existing reference - if (!is.null(reference)) { - - # Reference is unknown - if (!lgb.is.Dataset(reference)) { - stop("set_reference: Can only use lgb.Dataset as a reference") - } - + if (!lgb.is.Dataset(reference)) { + stop("set_reference: Can only use lgb.Dataset as a reference") } + # Set known references + self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature) + self$set_colnames(colnames = reference$get_colnames()) + private$set_predictor(predictor = reference$.__enclos_env__$private$predictor) + # Store reference private$reference <- reference diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 99982b0de36c..04ca2e3d1cd3 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -1,5 +1,9 @@ context("testing lgb.Dataset functionality") +data(agaricus.train, package = "lightgbm") +train_data <- agaricus.train$data[seq_len(1000L), ] +train_label <- agaricus.train$label[seq_len(1000L)] + data(agaricus.test, package = "lightgbm") test_data <- agaricus.test$data[1L:100L, ] test_label <- agaricus.test$label[1L:100L] @@ -74,6 +78,118 @@ test_that("Dataset$slice() supports passing Dataset attributes through '...'", { expect_identical(dsub1$getinfo("init_score"), init_score) }) +test_that("Dataset$set_reference() on a constructed Dataset fails if raw data has been freed", { + dtrain <- lgb.Dataset(train_data, label = train_label) + dtrain$construct() + dtest <- lgb.Dataset(test_data, label = test_label) + dtest$construct() + expect_error({ + dtest$set_reference(dtrain) + }, regexp = "cannot set reference after freeing raw data") +}) + +test_that("Dataset$set_reference() fails if reference is not a Dataset", { + dtrain <- lgb.Dataset( + train_data + , label = train_label + , free_raw_data = FALSE + ) + expect_error({ + dtrain$set_reference(reference = data.frame(x = rnorm(10L))) + }, regexp = "Can only use lgb.Dataset as a reference") + + # passing NULL when the Dataset already has a reference raises an error + dtest <- lgb.Dataset( + test_data + , label = test_label + , free_raw_data = FALSE + ) + dtrain$set_reference(dtest) + expect_error({ + dtrain$set_reference(reference = NULL) + }, regexp = "Can only use lgb.Dataset as a reference") +}) + +test_that("Dataset$set_reference() setting reference to the same Dataset has no side effects", { + dtrain <- lgb.Dataset( + train_data + , label = train_label + , free_raw_data = FALSE + , categorical_feature = c(2L, 3L) + ) + dtrain$construct() + + cat_features_before <- dtrain$.__enclos_env__$private$categorical_feature + colnames_before <- dtrain$get_colnames() + predictor_before <- dtrain$.__enclos_env__$private$predictor + + dtrain$set_reference(dtrain) + expect_identical( + cat_features_before + , dtrain$.__enclos_env__$private$categorical_feature + ) + expect_identical( + colnames_before + , dtrain$get_colnames() + ) + expect_identical( + predictor_before + , dtrain$.__enclos_env__$private$predictor + ) +}) + +test_that("Dataset$set_reference() updates categorical_feature, colnames, and predictor", { + dtrain <- lgb.Dataset( + train_data + , label = train_label + , free_raw_data = FALSE + , categorical_feature = c(2L, 3L) + ) + dtrain$construct() + bst <- Booster$new( + train_set = dtrain + , params = list(verbose = -1L) + ) + dtrain$.__enclos_env__$private$predictor <- bst$to_predictor() + + test_original_feature_names <- paste0("feature_col_", seq_len(ncol(test_data))) + dtest <- lgb.Dataset( + test_data + , label = test_label + , free_raw_data = FALSE + , colnames = test_original_feature_names + ) + dtest$construct() + + # at this point, dtest should not have categorical_feature + expect_null(dtest$.__enclos_env__$private$predictor) + expect_null(dtest$.__enclos_env__$private$categorical_feature) + expect_identical( + dtest$get_colnames() + , test_original_feature_names + ) + + dtest$set_reference(dtrain) + + # after setting reference to dtrain, those attributes should have dtrain's values + expect_is(dtest$.__enclos_env__$private$predictor, "lgb.Predictor") + expect_identical( + dtest$.__enclos_env__$private$predictor$.__enclos_env__$private$handle + , dtrain$.__enclos_env__$private$predictor$.__enclos_env__$private$handle + ) + expect_identical( + dtest$.__enclos_env__$private$categorical_feature + , dtrain$.__enclos_env__$private$categorical_feature + ) + expect_identical( + dtest$get_colnames() + , dtrain$get_colnames() + ) + expect_false( + identical(dtest$get_colnames(), test_original_feature_names) + ) +}) + test_that("lgb.Dataset: colnames", { dtest <- lgb.Dataset(test_data, label = test_label) expect_equal(colnames(dtest), colnames(test_data))