From eccd8078d5d82205b4babfb070d310f79d3ed5d0 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 12 Mar 2021 03:54:49 +0300 Subject: [PATCH 1/4] Update Parallel-Learning-Guide.rst --- docs/Parallel-Learning-Guide.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 9486a0d8cd23..0168b5c7cbf7 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -65,7 +65,7 @@ LightGBM's Python package supports distributed learning via `Dask`_. This integr Dask Examples ''''''''''''' -For sample code using ``lightgbm.dask``, see `these Dask examples`_ . +For sample code using ``lightgbm.dask``, see `these Dask examples`_. Training with Dask '''''''''''''''''' @@ -121,9 +121,9 @@ While setting up for training, ``lightgbm`` will concatenate all of the partitio When setting up data partitioning for LightGBM training with Dask, try to follow these suggestions: -* ensure that each worker in the cluster has some of the training data -* try to give each worker roughly the same amount of data, especially if your dataset is small -* if you plan to train multiple models (for example, to tune hyperparameters) on the same data, use ``client.persist()`` before training to materialize the data one time +* ensure that each worker in the cluster has some of the training data; +* try to give each worker roughly the same amount of data, especially if your dataset is small; +* if you plan to train multiple models (for example, to tune hyperparameters) on the same data, use ``client.persist()`` before training to materialize the data one time. Using a Specific Dask Client **************************** @@ -195,9 +195,9 @@ If you are running multiple Dask worker processes on physical host in the cluste Providing ``machines`` gives you complete control over the networking details of training, but it also makes the training process fragile. Training will fail if you use ``machines`` and any of the following are true: - * any of the ports mentioned in ``machines`` are not open when training begins - * some partitions of the training data are held by machines that that are not present in ``machines`` - * some machines mentioned in ``machines`` do not hold any of the training data + * any of the ports mentioned in ``machines`` are not open when training begins; + * some partitions of the training data are held by machines that that are not present in ``machines``; + * some machines mentioned in ``machines`` do not hold any of the training data. **Option 2: specify one port to use on every worker** @@ -223,8 +223,8 @@ You could edit your firewall rules to allow communication between any of the wor Providing ``local_listen_port`` is slightly less fragile than ``machines`` because LightGBM will automatically figure out which workers have pieces of the training data. However, using this method, training can fail if any of the following are true: - * the port ``local_listen_port`` is not open on any of the worker hosts - * any machine has multiple Dask worker processes running on it + * the port ``local_listen_port`` is not open on any of the worker hosts; + * any machine has multiple Dask worker processes running on it. Prediction with Dask '''''''''''''''''''' @@ -335,10 +335,10 @@ The lowest-level model object in LightGBM is the ``lightgbm.Booster``. 
After tra From the point forward, you can use any of the following methods to save the Booster: -* serialize with ``cloudpickle``, ``joblib``, or ``pickle`` -* ``bst.dump_model()``: dump the model to a dictionary which could be written out as JSON -* ``bst.model_to_string()``: dump the model to a string in memory -* ``bst.save_model()``: write the output of ``bst.model_to_string()`` to a text file +* serialize with ``cloudpickle``, ``joblib``, or ``pickle``; +* ``bst.dump_model()``: dump the model to a dictionary which could be written out as JSON; +* ``bst.model_to_string()``: dump the model to a string in memory; +* ``bst.save_model()``: write the output of ``bst.model_to_string()`` to a text file. Kubeflow ^^^^^^^^ From 841990c43f64789b60be6f932ec2449f0afa4020 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Fri, 12 Mar 2021 04:03:05 +0300 Subject: [PATCH 2/4] Update test.sh --- .ci/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test.sh b/.ci/test.sh index e66790866d77..65eac1b58225 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -224,7 +224,7 @@ import matplotlib\ matplotlib.use\(\"Agg\"\)\ ' plot_example.py # prevent interactive window mode sed -i'.bak' 's/graph.render(view=True)/graph.render(view=False)/' plot_example.py - for f in *.py; do python $f || exit -1; done # run all examples + for f in *.py **/*.py; do python $f || exit -1; done # run all examples cd $BUILD_DIRECTORY/examples/python-guide/notebooks conda install -q -y -n $CONDA_ENV ipywidgets notebook jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb || exit -1 # run all notebooks From cf324d21fe9c16f2275c48f94655fe27a76d7f61 Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Fri, 12 Mar 2021 04:33:41 +0300 Subject: [PATCH 3/4] fix path --- examples/python-guide/dask/binary-classification.py | 5 +++++ .../python-guide/dask/multiclass-classification.py | 5 +++++ examples/python-guide/dask/prediction.py | 5 +++++ examples/python-guide/dask/ranking.py | 12 ++++++++++-- examples/python-guide/dask/regression.py | 5 +++++ 5 files changed, 30 insertions(+), 2 deletions(-) diff --git a/examples/python-guide/dask/binary-classification.py b/examples/python-guide/dask/binary-classification.py index 4313e8da3ddb..d86fe3536df5 100644 --- a/examples/python-guide/dask/binary-classification.py +++ b/examples/python-guide/dask/binary-classification.py @@ -1,3 +1,5 @@ +import sys + import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_blobs @@ -5,6 +7,9 @@ import lightgbm as lgb if __name__ == "__main__": + if not sys.platform.startswith('linux'): + print('lightgbm.dask is currently supported in Linux environments') + sys.exit(0) print("loading data") diff --git a/examples/python-guide/dask/multiclass-classification.py b/examples/python-guide/dask/multiclass-classification.py index 8e40b35a8121..f17de98f2a84 100644 --- a/examples/python-guide/dask/multiclass-classification.py +++ b/examples/python-guide/dask/multiclass-classification.py @@ -1,3 +1,5 @@ +import sys + import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_blobs @@ -5,6 +7,9 @@ import lightgbm as lgb if __name__ == "__main__": + if not sys.platform.startswith('linux'): + print('lightgbm.dask is currently supported in Linux environments') + sys.exit(0) print("loading data") diff --git a/examples/python-guide/dask/prediction.py b/examples/python-guide/dask/prediction.py index 64e2bae0c08d..19c6ad75b51f 100644 --- 
a/examples/python-guide/dask/prediction.py +++ b/examples/python-guide/dask/prediction.py @@ -1,3 +1,5 @@ +import sys + import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_regression @@ -6,6 +8,9 @@ import lightgbm as lgb if __name__ == "__main__": + if not sys.platform.startswith('linux'): + print('lightgbm.dask is currently supported in Linux environments') + sys.exit(0) print("loading data") diff --git a/examples/python-guide/dask/ranking.py b/examples/python-guide/dask/ranking.py index b7cae20a44c4..67b1080d2515 100644 --- a/examples/python-guide/dask/ranking.py +++ b/examples/python-guide/dask/ranking.py @@ -1,3 +1,6 @@ +import os +import sys + import dask.array as da import numpy as np from distributed import Client, LocalCluster @@ -6,11 +9,16 @@ import lightgbm as lgb if __name__ == "__main__": + if not sys.platform.startswith('linux'): + print('lightgbm.dask is currently supported in Linux environments') + sys.exit(0) print("loading data") - X, y = load_svmlight_file("../lambdarank/rank.train") - group = np.loadtxt("../lambdarank/rank.train.query") + X, y = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../lambdarank/rank.train')) + group = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../lambdarank/rank.train.query')) print("initializing a Dask cluster") diff --git a/examples/python-guide/dask/regression.py b/examples/python-guide/dask/regression.py index 69a8f764732d..5c2766a3d947 100644 --- a/examples/python-guide/dask/regression.py +++ b/examples/python-guide/dask/regression.py @@ -1,3 +1,5 @@ +import sys + import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_regression @@ -5,6 +7,9 @@ import lightgbm as lgb if __name__ == "__main__": + if not sys.platform.startswith('linux'): + print('lightgbm.dask is currently supported in Linux environments') + sys.exit(0) print("loading data") From b2c1685921774b33014046d36b8dc5994d841755 Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Sun, 14 Mar 2021 19:35:40 +0300 Subject: [PATCH 4/4] address review comments --- docs/Parallel-Learning-Guide.rst | 28 +++++++++++-------- .../dask/binary-classification.py | 6 ---- .../dask/multiclass-classification.py | 6 ---- examples/python-guide/dask/prediction.py | 6 ---- examples/python-guide/dask/ranking.py | 5 ---- examples/python-guide/dask/regression.py | 6 ---- python-package/README.rst | 4 +++ 7 files changed, 20 insertions(+), 41 deletions(-) diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 0168b5c7cbf7..7bcb3fdfa865 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -62,6 +62,10 @@ Dask LightGBM's Python package supports distributed learning via `Dask`_. This integration is maintained by LightGBM's maintainers. +.. warning:: + + Dask integration is only tested on Linux. + Dask Examples ''''''''''''' @@ -121,9 +125,9 @@ While setting up for training, ``lightgbm`` will concatenate all of the partitio When setting up data partitioning for LightGBM training with Dask, try to follow these suggestions: -* ensure that each worker in the cluster has some of the training data; -* try to give each worker roughly the same amount of data, especially if your dataset is small; -* if you plan to train multiple models (for example, to tune hyperparameters) on the same data, use ``client.persist()`` before training to materialize the data one time. 
+* ensure that each worker in the cluster has some of the training data +* try to give each worker roughly the same amount of data, especially if your dataset is small +* if you plan to train multiple models (for example, to tune hyperparameters) on the same data, use ``client.persist()`` before training to materialize the data one time Using a Specific Dask Client **************************** @@ -195,9 +199,9 @@ If you are running multiple Dask worker processes on physical host in the cluste Providing ``machines`` gives you complete control over the networking details of training, but it also makes the training process fragile. Training will fail if you use ``machines`` and any of the following are true: - * any of the ports mentioned in ``machines`` are not open when training begins; - * some partitions of the training data are held by machines that that are not present in ``machines``; - * some machines mentioned in ``machines`` do not hold any of the training data. + * any of the ports mentioned in ``machines`` are not open when training begins + * some partitions of the training data are held by machines that that are not present in ``machines`` + * some machines mentioned in ``machines`` do not hold any of the training data **Option 2: specify one port to use on every worker** @@ -223,8 +227,8 @@ You could edit your firewall rules to allow communication between any of the wor Providing ``local_listen_port`` is slightly less fragile than ``machines`` because LightGBM will automatically figure out which workers have pieces of the training data. However, using this method, training can fail if any of the following are true: - * the port ``local_listen_port`` is not open on any of the worker hosts; - * any machine has multiple Dask worker processes running on it. + * the port ``local_listen_port`` is not open on any of the worker hosts + * any machine has multiple Dask worker processes running on it Prediction with Dask '''''''''''''''''''' @@ -335,10 +339,10 @@ The lowest-level model object in LightGBM is the ``lightgbm.Booster``. After tra From the point forward, you can use any of the following methods to save the Booster: -* serialize with ``cloudpickle``, ``joblib``, or ``pickle``; -* ``bst.dump_model()``: dump the model to a dictionary which could be written out as JSON; -* ``bst.model_to_string()``: dump the model to a string in memory; -* ``bst.save_model()``: write the output of ``bst.model_to_string()`` to a text file. 
+* serialize with ``cloudpickle``, ``joblib``, or ``pickle`` +* ``bst.dump_model()``: dump the model to a dictionary which could be written out as JSON +* ``bst.model_to_string()``: dump the model to a string in memory +* ``bst.save_model()``: write the output of ``bst.model_to_string()`` to a text file Kubeflow ^^^^^^^^ diff --git a/examples/python-guide/dask/binary-classification.py b/examples/python-guide/dask/binary-classification.py index d86fe3536df5..4de9245d4472 100644 --- a/examples/python-guide/dask/binary-classification.py +++ b/examples/python-guide/dask/binary-classification.py @@ -1,5 +1,3 @@ -import sys - import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_blobs @@ -7,10 +5,6 @@ import lightgbm as lgb if __name__ == "__main__": - if not sys.platform.startswith('linux'): - print('lightgbm.dask is currently supported in Linux environments') - sys.exit(0) - print("loading data") X, y = make_blobs(n_samples=1000, n_features=50, centers=2) diff --git a/examples/python-guide/dask/multiclass-classification.py b/examples/python-guide/dask/multiclass-classification.py index f17de98f2a84..bcda9589ab84 100644 --- a/examples/python-guide/dask/multiclass-classification.py +++ b/examples/python-guide/dask/multiclass-classification.py @@ -1,5 +1,3 @@ -import sys - import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_blobs @@ -7,10 +5,6 @@ import lightgbm as lgb if __name__ == "__main__": - if not sys.platform.startswith('linux'): - print('lightgbm.dask is currently supported in Linux environments') - sys.exit(0) - print("loading data") X, y = make_blobs(n_samples=1000, n_features=50, centers=3) diff --git a/examples/python-guide/dask/prediction.py b/examples/python-guide/dask/prediction.py index 19c6ad75b51f..a4cb5cd8592e 100644 --- a/examples/python-guide/dask/prediction.py +++ b/examples/python-guide/dask/prediction.py @@ -1,5 +1,3 @@ -import sys - import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_regression @@ -8,10 +6,6 @@ import lightgbm as lgb if __name__ == "__main__": - if not sys.platform.startswith('linux'): - print('lightgbm.dask is currently supported in Linux environments') - sys.exit(0) - print("loading data") X, y = make_regression(n_samples=1000, n_features=50) diff --git a/examples/python-guide/dask/ranking.py b/examples/python-guide/dask/ranking.py index 67b1080d2515..5693ed9a5b67 100644 --- a/examples/python-guide/dask/ranking.py +++ b/examples/python-guide/dask/ranking.py @@ -1,5 +1,4 @@ import os -import sys import dask.array as da import numpy as np @@ -9,10 +8,6 @@ import lightgbm as lgb if __name__ == "__main__": - if not sys.platform.startswith('linux'): - print('lightgbm.dask is currently supported in Linux environments') - sys.exit(0) - print("loading data") X, y = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), diff --git a/examples/python-guide/dask/regression.py b/examples/python-guide/dask/regression.py index 5c2766a3d947..4d15547ff501 100644 --- a/examples/python-guide/dask/regression.py +++ b/examples/python-guide/dask/regression.py @@ -1,5 +1,3 @@ -import sys - import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_regression @@ -7,10 +5,6 @@ import lightgbm as lgb if __name__ == "__main__": - if not sys.platform.startswith('linux'): - print('lightgbm.dask is currently supported in Linux environments') - sys.exit(0) - 
print("loading data") X, y = make_regression(n_samples=1000, n_features=50) diff --git a/python-package/README.rst b/python-package/README.rst index 7f4d2d35726b..c6da99229b0e 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -204,6 +204,10 @@ You can use ``python setup.py bdist_wheel`` instead of ``python setup.py install Install Dask-package '''''''''''''''''''' +.. warning:: + + Dask-package is only tested on Linux. + To install all additional dependencies required for Dask-package, you can append ``[dask]`` to LightGBM package name: .. code:: sh
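
As a companion to the networking discussion in ``Parallel-Learning-Guide.rst`` above, here is a minimal sketch of the two options. It assumes the ``lightgbm.dask`` estimators accept ``machines`` (a comma-delimited string of ``ip:port`` pairs, per LightGBM's network parameters) and ``local_listen_port`` as keyword arguments; all addresses and ports below are hypothetical placeholders.

.. code:: python

    import lightgbm as lgb

    # Option 1: "machines" gives complete control over networking;
    # every worker holding training data must appear in the list.
    # (addresses and ports are hypothetical placeholders)
    clf = lgb.DaskLGBMClassifier(
        machines="10.0.1.2:12400,10.0.1.3:12400"
    )

    # Option 2: "local_listen_port" reuses one port on every worker host;
    # LightGBM then figures out which workers hold pieces of the data.
    # This assumes at most one Dask worker process per machine.
    clf = lgb.DaskLGBMClassifier(
        local_listen_port=12400
    )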
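To complement the example scripts above, a single end-to-end sketch of three behaviors the guide describes: materializing the data once with ``client.persist()``, predicting on Dask collections, and saving the underlying Booster. It mirrors ``regression.py``; the ``booster_`` attribute (from the scikit-learn-style interface) and the small parameter values are assumptions for illustration.

.. code:: python

    import dask.array as da
    import joblib
    from distributed import Client, LocalCluster
    from sklearn.datasets import make_regression

    import lightgbm as lgb

    if __name__ == "__main__":
        X, y = make_regression(n_samples=1000, n_features=50)
        dX = da.from_array(X, chunks=(100, 50))
        dy = da.from_array(y, chunks=(100,))

        with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
            # materialize the data one time up front -- useful when
            # training multiple models on the same data
            dX, dy = client.persist([dX, dy])

            dask_model = lgb.DaskLGBMRegressor(n_estimators=10)
            dask_model.fit(dX, dy)

            # predictions on Dask collections are lazy Dask arrays
            preds = dask_model.predict(dX).compute()

        # the fitted low-level model object
        bst = dask_model.booster_

        # serialize the Booster object itself
        joblib.dump(bst, "model.pkl")

        # dump the model to a dictionary which could be written out as JSON
        model_dict = bst.dump_model()

        # dump the model to a string in memory
        model_str = bst.model_to_string()

        # write the string representation of the model to a text file
        bst.save_model("model.txt")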
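On the ``python-package/README.rst`` change: given pip's standard extras syntax, appending ``[dask]`` to the package name as described there amounts to running ``pip install lightgbm[dask]``.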