From 083e94039284dd6f49e711550ee68327719495e2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 18 Jan 2021 22:49:55 -0600 Subject: [PATCH 1/4] [dask] allow parameter aliases for tree_learner and local_listen_port (fixes #3671) --- python-package/lightgbm/basic.py | 7 ++++++ python-package/lightgbm/dask.py | 32 ++++++++++++++++++++++---- tests/python_package_test/test_dask.py | 11 +++++---- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 5ab1f7128b08..7cbf0d4205ee 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -238,6 +238,9 @@ class _ConfigAliases: "sparse"}, "label_column": {"label_column", "label"}, + "local_listen_port": {"local_listen_port", + "local_port", + "port"}, "machines": {"machines", "workers", "nodes"}, @@ -261,6 +264,10 @@ class _ConfigAliases: "application"}, "pre_partition": {"pre_partition", "is_pre_partition"}, + "tree_learner": {"tree_learner", + "tree", + "tree_type", + "tree_learner_type"}, "two_round": {"two_round", "two_round_loading", "use_two_round_loading"}, diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index fb8b06077e70..750e9081746e 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -17,7 +17,7 @@ from dask import delayed from dask.distributed import Client, default_client, get_worker, wait -from .basic import _LIB, _safe_call +from .basic import _ConfigAliases, _LIB, _safe_call from .sklearn import LGBMClassifier, LGBMRegressor import scipy.sparse as ss @@ -197,15 +197,37 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): master_worker = next(iter(worker_map)) worker_ncores = client.ncores() - if 'tree_learner' not in params or params['tree_learner'].lower() not in {'data', 'feature', 'voting'}: - logger.warning('Parameter tree_learner not set or set to incorrect value ' - '(%s), using "data" as default', params.get("tree_learner", None)) + tree_learner = None + for tree_learner_param in _ConfigAliases.get('tree_learner'): + tree_learner = params.get(tree_learner_param) + if tree_learner is not None: + break + + allowed_tree_learners = { + 'data', + 'data_parallel', + 'feature', + 'feature_parallel', + 'voting', + 'voting_parallel' + } + if tree_learner is None: + logger.warning('Parameter tree_learner not set. Using "data" as default"') params['tree_learner'] = 'data' + elif tree_learner.lower() not in allowed_tree_learners: + logger.warning('Parameter tree_learner set to %s, which is not allowed. Using "data" as default' % tree_learner) + params['tree_learner'] = 'data' + + local_listen_port = 12400 + for port_param in _ConfigAliases.get('local_listen_port'): + val = params.get(port_param) + if val is not None: + local_listen_port = val + break # find an open port on each worker. note that multiple workers can run # on the same machine, so this needs to ensure that each one gets its # own port - local_listen_port = params.get('local_listen_port', 12400) worker_address_to_port = _find_ports_for_workers( client=client, worker_addresses=worker_map.keys(), diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 901584dafd9c..e556cd9c89ca 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -124,7 +124,7 @@ def test_classifier_local_predict(client, listen_port): dask_classifier = dlgbm.DaskLGBMClassifier( time_out=5, - local_listen_port=listen_port, + local_port=listen_port, n_estimators=10, num_leaves=10 ) @@ -148,7 +148,8 @@ def test_regressor(output, client, listen_port): time_out=5, local_listen_port=listen_port, seed=42, - num_leaves=10 + num_leaves=10, + tree='data' ) dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw) p1 = dask_regressor.predict(dX) @@ -181,7 +182,8 @@ def test_regressor_quantile(output, client, listen_port, alpha): objective='quantile', alpha=alpha, n_estimators=10, - num_leaves=10 + num_leaves=10, + tree_learner_type='data_parallel' ) dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw) p1 = dask_regressor.predict(dX).compute() @@ -210,7 +212,8 @@ def test_regressor_local_predict(client, listen_port): local_listen_port=listen_port, seed=42, n_estimators=10, - num_leaves=10 + num_leaves=10, + tree_type='data' ) dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw, client=client) p1 = dask_regressor.predict(dX) From 0f6331384dab36170d405b0ed8febf87c2689517 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 18 Jan 2021 23:14:09 -0600 Subject: [PATCH 2/4] num_thread too --- python-package/lightgbm/basic.py | 5 +++++ python-package/lightgbm/dask.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 7cbf0d4205ee..565a45e55846 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -258,6 +258,11 @@ class _ConfigAliases: "num_rounds", "num_boost_round", "n_estimators"}, + "num_threads": {"num_threads", + "num_thread", + "nthread", + "nthreads", + "n_jobs"}, "objective": {"objective", "objective_type", "app", diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 750e9081746e..924471d3dfe2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -7,6 +7,7 @@ import logging import socket from collections import defaultdict +from copy import deepcopy from typing import Dict, Iterable from urllib.parse import urlparse @@ -170,6 +171,8 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): sample_weight : array-like of shape = [n_samples] or None, optional (default=None) Weights of training data. """ + params = deepcopy(params) + # Split arrays/dataframes into parts. Arrange parts into tuples to enforce co-locality data_parts = _split_to_parts(data, is_matrix=True) label_parts = _split_to_parts(label, is_matrix=False) @@ -234,6 +237,10 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): local_listen_port=local_listen_port ) + # num_threads is set below, so remove it and all aliases of it from params + for num_thread_alias in _ConfigAliases.get('num_threads'): + _ = params.pop(num_thread_alias, None) + # Tell each worker to train on the parts that it has locally futures_classifiers = [client.submit(_train_part, model_factory=model_factory, From 3778fa7a9bafb8f94dbefa103cad4b1e47cddfed Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 19 Jan 2021 15:09:56 -0600 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index aed0e71a5d72..ba84edca87c2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -215,7 +215,7 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): 'voting_parallel' } if tree_learner is None: - logger.warning('Parameter tree_learner not set. Using "data" as default"') + logger.warning('Parameter tree_learner not set. Using "data" as default') params['tree_learner'] = 'data' elif tree_learner.lower() not in allowed_tree_learners: logger.warning('Parameter tree_learner set to %s, which is not allowed. Using "data" as default' % tree_learner) @@ -239,7 +239,7 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): # num_threads is set below, so remove it and all aliases of it from params for num_thread_alias in _ConfigAliases.get('num_threads'): - _ = params.pop(num_thread_alias, None) + params.pop(num_thread_alias, None) # Tell each worker to train on the parts that it has locally futures_classifiers = [client.submit(_train_part, From db84675ef0a66f271460f6fcfb7edcf506b920ea Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 20 Jan 2021 10:34:56 -0600 Subject: [PATCH 4/4] empty commit