diff --git a/.ci/test.sh b/.ci/test.sh index 0c28943d2458..cf3f51e3899a 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -70,7 +70,7 @@ if [[ $TASK == "if-else" ]]; then exit 0 fi -conda install -q -y -n $CONDA_ENV joblib matplotlib numpy pandas psutil pytest python-graphviz scikit-learn scipy dask distributed dask-ml +conda install -q -y -n $CONDA_ENV dask dask-ml distributed joblib matplotlib numpy pandas psutil pytest python-graphviz scikit-learn scipy if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "clang" ]]; then # fix "OMP: Error #15: Initializing libiomp5.dylib, but found libomp.dylib already initialized." (OpenMP library conflict due to conda's MKL) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index f5ee52bee2e1..8f5ac1ee2ffb 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1,3 +1,4 @@ +# coding: utf-8 """Distributed training with LightGBM and Dask.distributed. This module enables you to perform distributed training with LightGBM on Dask.Array and Dask.DataFrame collections. @@ -13,15 +14,11 @@ from dask import dataframe as dd from dask import delayed from dask.distributed import default_client, get_worker, wait -from toolz import assoc, first from .basic import _LIB, _safe_call from .sklearn import LGBMClassifier as LocalLGBMClassifier, LGBMRegressor as LocalLGBMRegressor -try: - import scipy.sparse as ss -except ImportError: - ss = False +import scipy.sparse as ss logger = logging.getLogger(__name__) @@ -32,7 +29,7 @@ def _parse_host_port(address): def _build_network_params(worker_addresses, local_worker_ip, local_listen_port, time_out): - """Build network parameters suiltable for LightGBM C backend. + """Build network parameters suitable for LightGBM C backend. Parameters ---------- @@ -60,7 +57,7 @@ def _concat(seq): return np.concatenate(seq, axis=0) elif isinstance(seq[0], (pd.DataFrame, pd.Series)): return pd.concat(seq, axis=0) - elif ss and isinstance(seq[0], ss.spmatrix): + elif isinstance(seq[0], ss.spmatrix): return ss.vstack(seq, format='csr') else: raise TypeError('Data must be one of: numpy arrays, pandas dataframes, sparse matrices (from scipy). Got %s.' % str(type(seq[0]))) @@ -131,9 +128,9 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): who_has = client.who_has(parts) worker_map = defaultdict(list) for key, workers in who_has.items(): - worker_map[first(workers)].append(key_to_part_dict[key]) + worker_map[next(iter(workers))].append(key_to_part_dict[key]) - master_worker = first(worker_map) + master_worker = next(iter(worker_map)) worker_ncores = client.ncores() if 'tree_learner' not in params or params['tree_learner'].lower() not in {'data', 'feature', 'voting'}: @@ -144,7 +141,7 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs): # Tell each worker to train on the parts that it has locally futures_classifiers = [client.submit(_train_part, model_factory=model_factory, - params=assoc(params, 'num_threads', worker_ncores[worker]), + params={**params, 'num_threads': worker_ncores[worker]}, list_of_parts=list_of_parts, worker_addresses=list(worker_map.keys()), local_listen_port=params.get('local_listen_port', 12400), diff --git a/python-package/setup.py b/python-package/setup.py index ac8d0a0293ff..8b5bce1e5c6e 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -350,7 +350,6 @@ def run(self): 'dask[dataframe]>=2.0.0' 'dask[distributed]>=2.0.0', 'pandas', - 'toolz' ], }, maintainer='Guolin Ke', diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 5a445fa91163..2450e6ea5e5f 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1,3 +1,4 @@ +# coding: utf-8 import os import sys @@ -70,16 +71,16 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size def test_classifier(output, centers, client, listen_port): # noqa X, y, w, dX, dy, dw = _create_data('classification', output=output, centers=centers) - a = dlgbm.LGBMClassifier(time_out=5, local_listen_port=listen_port) - a = a.fit(dX, dy, sample_weight=dw, client=client) - p1 = a.predict(dX) + classifier_a = dlgbm.LGBMClassifier(time_out=5, local_listen_port=listen_port) + classifier_a = classifier_a.fit(dX, dy, sample_weight=dw, client=client) + p1 = classifier_a.predict(dX) s1 = accuracy_score(dy, p1) p1 = p1.compute() - b = lightgbm.LGBMClassifier() - b.fit(X, y, sample_weight=w) - p2 = b.predict(X) - s2 = b.score(X, y) + classifier_b = lightgbm.LGBMClassifier() + classifier_b.fit(X, y, sample_weight=w) + p2 = classifier_b.predict(X) + s2 = classifier_b.score(X, y) assert_eq(s1, s2) @@ -162,11 +163,11 @@ def test_regressor_quantile(output, client, listen_port, alpha): # noqa q2 = np.count_nonzero(y < p2) / y.shape[0] # Quantiles should be right - np.isclose(q1, alpha, atol=.1) - np.isclose(q2, alpha, atol=.1) + np.testing.assert_allclose(q1, alpha, atol=0.2) + np.testing.assert_allclose(q2, alpha, atol=0.2) -def test_regressor_local_predict(client, listen_port): # noqa +def test_regressor_local_predict(client, listen_port): X, y, w, dX, dy, dw = _create_data('regression', output='array') a = dlgbm.LGBMRegressor(local_listen_port=listen_port, seed=42) @@ -179,7 +180,7 @@ def test_regressor_local_predict(client, listen_port): # noqa # Predictions and scores should be the same assert_eq(p1, p2) - np.isclose(s1, s2) + assert_eq(s1, s2) def test_build_network_params():