diff --git a/README.md b/README.md index 3839ee5af824..fd2c47884810 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,6 @@ ML.NET (.NET/C#-package): https://github.com/dotnet/machinelearning LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net -Dask-LightGBM (distributed and parallel Python-package): https://github.com/dask/dask-lightgbm - Ruby gem: https://github.com/ankane/lightgbm LightGBM4j (Java high-level binding): https://github.com/metarank/lightgbm4j diff --git a/docs/FAQ.rst b/docs/FAQ.rst index e2c7ad9aa766..e90776347b01 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -24,7 +24,7 @@ You may also ping a member of the core team according to the relevant area of ex - `@chivee `__ **Qiwei Ye** (C++ code / Python-package) - `@btrotta `__ **Belinda Trotta** (C++ code) - `@Laurae2 `__ **Damien Soukhavong** (R-package) -- `@jameslamb `__ **James Lamb** (R-package) +- `@jameslamb `__ **James Lamb** (R-package / Dask-package) - `@wxchan `__ **Wenxuan Chen** (Python-package) - `@henry0312 `__ **Tsukasa Omoto** (Python-package) - `@StrikerRUS `__ **Nikita Titov** (Python-package) diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 3cd57086447d..6dde5d8fbf03 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -7,7 +7,7 @@ Follow the `Quick Start <./Quick-Start.rst>`__ to know how to use LightGBM first **List of external libraries in which LightGBM can be used in a distributed fashion** -- `Dask-LightGBM`_ allows to create ML workflow on Dask distributed data structures. +- `Dask API of LightGBM <./Python-API.rst#dask-api>`__ (formerly a separate package) allows you to create ML workflows on Dask distributed data structures. - `MMLSpark`_ integrates LightGBM into Apache Spark ecosystem. `The following example`_ demonstrates how easy it's possible to utilize the great power of Spark. @@ -134,8 +134,6 @@ Example - `A simple parallel example`_ -..
_Dask-LightGBM: https://github.com/dask/dask-lightgbm - .. _MMLSpark: https://aka.ms/spark .. _The following example: https://github.com/Azure/mmlspark/blob/master/notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb diff --git a/docs/Python-API.rst b/docs/Python-API.rst index ef249ad4c700..5dee4583b41c 100644 --- a/docs/Python-API.rst +++ b/docs/Python-API.rst @@ -33,6 +33,16 @@ Scikit-learn API LGBMRegressor LGBMRanker +Dask API +-------- + +.. autosummary:: + :toctree: pythonapi/ + + DaskLGBMClassifier + DaskLGBMRegressor + DaskLGBMRanker + Callbacks --------- diff --git a/docs/conf.py b/docs/conf.py index b84321b955ee..a66be6df8ccf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,7 +39,7 @@ # -- mock out modules MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', - 'sklearn', 'matplotlib', 'pandas', 'graphviz'] + 'sklearn', 'matplotlib', 'pandas', 'graphviz', 'dask', 'dask.distributed'] for mod_name in MOCK_MODULES: sys.modules[mod_name] = Mock() diff --git a/python-package/README.rst b/python-package/README.rst index 265f9d47053f..121def9b13f7 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -183,12 +183,22 @@ Run ``python setup.py install --bit32``, if you want to use 32-bit version. All If you get any errors during installation or due to any other reasons, you may want to build dynamic library from sources by any method you prefer (see `Installation Guide `__) and then just run ``python setup.py install --precompile``. - Build Wheel File **************** You can use ``python setup.py bdist_wheel`` instead of ``python setup.py install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access. +Install Dask-package +'''''''''''''''''''' + +To install all additional dependencies required for Dask-package, you can append ``[dask]`` to LightGBM package name: + +.. 
code:: sh + + pip install lightgbm[dask] + +Or replace ``python setup.py install`` with ``pip install -e .[dask]`` if you are installing the package from source files. + Troubleshooting --------------- diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 44f2e56679f0..c8bbb84844bf 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -19,6 +19,10 @@ plot_tree, create_tree_digraph) except ImportError: pass +try: + from .dask import DaskLGBMRegressor, DaskLGBMClassifier, DaskLGBMRanker +except ImportError: + pass dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -31,5 +35,6 @@ 'register_logger', 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', + 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index fa12ae2c975a..e118075775c6 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -105,3 +105,12 @@ def _check_sample_weight(sample_weight, X, dtype=None): _LGBMAssertAllFinite = None _LGBMCheckClassificationTargets = None _LGBMComputeSampleWeight = None + +"""dask""" +try: + from dask import array + from dask import dataframe + from dask.distributed import Client + DASK_INSTALLED = True +except ImportError: + DASK_INSTALLED = False diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 3fbb6183d9ee..4acbf10702d7 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -21,7 +21,8 @@ from dask import delayed from dask.distributed import Client, default_client, get_worker, wait -from .basic import _ConfigAliases, _LIB, _log_warning, _safe_call +from .basic import _ConfigAliases, _LIB, _log_warning, 
_safe_call, LightGBMError +from .compat import DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED from .sklearn import LGBMClassifier, LGBMRegressor, LGBMRanker @@ -393,6 +394,9 @@ def _predict(model, data, raw_score=False, pred_proba=False, pred_leaf=False, pr class _LGBMModel: + def __init__(self): + if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): + raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') def _fit(self, model_factory, X, y=None, sample_weight=None, group=None, client=None, **kwargs): """Docstring is inherited from the LGBMModel.""" @@ -431,7 +435,7 @@ def _copy_extra_params(source, dest): setattr(dest, name, attributes[name]) -class DaskLGBMClassifier(_LGBMModel, LGBMClassifier): +class DaskLGBMClassifier(LGBMClassifier, _LGBMModel): """Distributed version of lightgbm.LGBMClassifier.""" def fit(self, X, y=None, sample_weight=None, client=None, **kwargs): @@ -479,7 +483,7 @@ def to_local(self): return self._to_local(LGBMClassifier) -class DaskLGBMRegressor(_LGBMModel, LGBMRegressor): +class DaskLGBMRegressor(LGBMRegressor, _LGBMModel): """Docstring is inherited from the lightgbm.LGBMRegressor.""" def fit(self, X, y=None, sample_weight=None, client=None, **kwargs): @@ -515,7 +519,7 @@ def to_local(self): return self._to_local(LGBMRegressor) -class DaskLGBMRanker(_LGBMModel, LGBMRanker): +class DaskLGBMRanker(LGBMRanker, _LGBMModel): """Docstring is inherited from the lightgbm.LGBMRanker.""" def fit(self, X, y=None, sample_weight=None, init_score=None, group=None, client=None, **kwargs): diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 51f1b7e6e9df..478b1efac33b 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -334,7 +334,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"} for obj_alias in _ConfigAliases.get("objective")): if not SKLEARN_INSTALLED: 
- raise LightGBMError('Scikit-learn is required for ranking cv.') + raise LightGBMError('scikit-learn is required for ranking cv') # ranking task, split according to groups group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False) flatted_group = np.repeat(range(len(group_info)), repeats=group_info) @@ -342,7 +342,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group) elif stratified: if not SKLEARN_INSTALLED: - raise LightGBMError('Scikit-learn is required for stratified cv.') + raise LightGBMError('scikit-learn is required for stratified cv') skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed) folds = skf.split(X=np.zeros(num_data), y=full_data.get_label()) else: diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 9fa930c906f0..96efef17dc29 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -289,7 +289,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1, and you should group grad and hess in this way as well. """ if not SKLEARN_INSTALLED: - raise LightGBMError('Scikit-learn is required for this module') + raise LightGBMError('scikit-learn is required for lightgbm.sklearn') self.boosting_type = boosting_type self.objective = objective diff --git a/python-package/setup.py b/python-package/setup.py index 3e545c490682..08cc7d0c0b2b 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -344,7 +344,7 @@ def run(self): extras_require={ 'dask': [ 'dask[array]>=2.0.0', - 'dask[dataframe]>=2.0.0' + 'dask[dataframe]>=2.0.0', 'dask[distributed]>=2.0.0', 'pandas', ],