From 465874ebfbcf0155f1c905355ac17a0c3e9d76a2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 8 Feb 2021 22:39:31 -0600 Subject: [PATCH 1/7] got fit() working --- python-package/lightgbm/dask.py | 62 +++++++- python-package/lightgbm/sklearn.py | 234 ++++++++++++++++------------- 2 files changed, 184 insertions(+), 112 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index a62f552b60dd..c3b2db11d017 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -19,7 +19,7 @@ from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat, SKLEARN_INSTALLED, LGBMNotFittedError, DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait) -from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker +from .sklearn import _lgbmmodel_doc_fit, LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] _DaskMatrixLike = Union[dask_Array, dask_DataFrame] @@ -604,7 +604,24 @@ def fit( **kwargs ) - fit.__doc__ = LGBMClassifier.fit.__doc__ + _base_doc = _lgbmmodel_doc_fit.format( + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + y_shape="dask Array, dask DataFrame or dask Series of shape = [n_samples]", + sample_weight_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" + ) + + # DaskLGBMClassifier does not support init_score, evaluation data, + # or early stopping + _base_doc = (_base_doc[:_base_doc.find('init_score :')] + + _base_doc[_base_doc.find('verbose :'):]) + + # DaskLGBMClassifier support for callbacks and init_model is not tested + fit.__doc__ = ( + _base_doc[:_base_doc.find('callbacks :')] + + '**kwargs\n' + + ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n' + ) def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMClassifier.predict.""" @@ -721,7 +738,24 @@ def fit( **kwargs ) - fit.__doc__ = LGBMRegressor.fit.__doc__ + _base_doc = _lgbmmodel_doc_fit.format( + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + y_shape="dask Array, dask DataFrame or dask Series of shape = [n_samples]", + sample_weight_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" + ) + + # DaskLGBMRegressor does not support init_score, evaluation data, + # or early stopping + _base_doc = (_base_doc[:_base_doc.find('init_score :')] + + _base_doc[_base_doc.find('verbose :'):]) + + # DaskLGBMRegressor support for callbacks and init_model is not tested + fit.__doc__ = ( + _base_doc[:_base_doc.find('callbacks :')] + + '**kwargs\n' + + ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n' + ) def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMRegressor.predict.""" @@ -832,7 +866,27 @@ def fit( **kwargs ) - fit.__doc__ = LGBMRanker.fit.__doc__ + _base_doc = _lgbmmodel_doc_fit.format( + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + y_shape="dask Array, dask DataFrame or dask Series of shape = [n_samples]", + sample_weight_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" + ) + + # DaskLGBMRanker does not support init_score, evaluation data, + # or early stopping + _base_doc = (_base_doc[:_base_doc.find('init_score :')] + + _base_doc[_base_doc.find('init_score :'):]) + + _base_doc = (_base_doc[:_base_doc.find('eval_set :')] + + _base_doc[_base_doc.find('verbose :'):]) + + # DaskLGBMRanker support for callbacks and init_model is not tested + fit.__doc__ = ( + _base_doc[:_base_doc.find('callbacks :')] + + '**kwargs\n' + + ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n' + ) def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMRanker.predict.""" diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 6a48f2c7b7d5..340a4d1a7b26 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -176,6 +176,125 @@ def __call__(self, preds, dataset): raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc) +# documentation for LGBMModel methods is shared between the classes here +# and those in the ``dask`` module + +_lgbmmodel_doc_fit = ( + """ + Build a gradient boosting model from the training set (X, y). + + Parameters + ---------- + X : {X_shape} + Input feature matrix. + y : {y_shape} + The target values (class labels in classification, real numbers in regression). + sample_weight : {sample_weight_shape} + Weights of training data. + init_score : array-like of shape = [n_samples] or None, optional (default=None) + Init score of training data. + group : {group_shape} + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + eval_set : list or None, optional (default=None) + A list of (X, y) tuple pairs to use as validation sets. + eval_names : list of strings or None, optional (default=None) + Names of eval_set. + eval_sample_weight : list of arrays or None, optional (default=None) + Weights of eval data. + eval_class_weight : list or None, optional (default=None) + Class weights of eval data. + eval_init_score : list of arrays or None, optional (default=None) + Init score of eval data. + eval_group : list of arrays or None, optional (default=None) + Group data of eval data. + eval_metric : string, callable, list or None, optional (default=None) + If string, it should be a built-in evaluation metric to use. + If callable, it should be a custom evaluation metric, see note below for more details. + If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both. + In either case, the ``metric`` from the model parameters will be evaluated and used as well. + Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker. + early_stopping_rounds : int or None, optional (default=None) + Activates early stopping. The model will train until the validation score stops improving. + Validation score needs to improve at least every ``early_stopping_rounds`` round(s) + to continue training. + Requires at least one validation data and one metric. + If there's more than one, will check all of them. But the training data is ignored anyway. + To check only the first metric, set the ``first_metric_only`` parameter to ``True`` + in additional parameters ``**kwargs`` of the model constructor. + verbose : bool or int, optional (default=True) + Requires at least one evaluation data. + If True, the eval metric on the eval set is printed at each boosting stage. + If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage. + The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed. + + .. rubric:: Example + + With ``verbose`` = 4 and at least one item in ``eval_set``, + an evaluation metric is printed every 4 (instead of 1) boosting stages. + + feature_name : list of strings or 'auto', optional (default='auto') + Feature names. + If 'auto' and data is pandas DataFrame, data columns names are used. + categorical_feature : list of strings or int, or 'auto', optional (default='auto') + Categorical features. + If list of int, interpreted as indices. + If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + callbacks : list of callback functions or None, optional (default=None) + List of callback functions that are applied at each iteration. + See Callbacks in Python API for more information. + init_model : string, Booster, LGBMModel or None, optional (default=None) + Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training. + + Returns + ------- + self : object + Returns self. + """ +) + +_lgbmmodel_doc_custom_eval_note = """ + Note + ---- + Custom eval function expects a callable with following signatures: + ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or + ``func(y_true, y_pred, weight, group)`` + and returns (eval_name, eval_result, is_higher_better) or + list of (eval_name, eval_result, is_higher_better): + + y_true : array-like of shape = [n_samples] + The target values. + y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + The predicted values. + weight : array-like of shape = [n_samples] + The weight of samples. + group : array-like + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + eval_name : string + The name of evaluation function (without whitespaces). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + + For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``). + For multi-class task, the y_pred is group by class_id first, then group by row_id. + If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]. +""" + + class LGBMModel(_LGBMModelBase): """Implementation of the scikit-learn API for LightGBM.""" @@ -382,115 +501,7 @@ def fit(self, X, y, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None, init_model=None): - """Build a gradient boosting model from the training set (X, y). - Parameters - ---------- - X : array-like or sparse matrix of shape = [n_samples, n_features] - Input feature matrix. - y : array-like of shape = [n_samples] - The target values (class labels in classification, real numbers in regression). - sample_weight : array-like of shape = [n_samples] or None, optional (default=None) - Weights of training data. - init_score : array-like of shape = [n_samples] or None, optional (default=None) - Init score of training data. - group : array-like or None, optional (default=None) - Group/query data. - Only used in the learning-to-rank task. - sum(group) = n_samples. - For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, - where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - eval_set : list or None, optional (default=None) - A list of (X, y) tuple pairs to use as validation sets. - eval_names : list of strings or None, optional (default=None) - Names of eval_set. - eval_sample_weight : list of arrays or None, optional (default=None) - Weights of eval data. - eval_class_weight : list or None, optional (default=None) - Class weights of eval data. - eval_init_score : list of arrays or None, optional (default=None) - Init score of eval data. - eval_group : list of arrays or None, optional (default=None) - Group data of eval data. - eval_metric : string, callable, list or None, optional (default=None) - If string, it should be a built-in evaluation metric to use. - If callable, it should be a custom evaluation metric, see note below for more details. - If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both. - In either case, the ``metric`` from the model parameters will be evaluated and used as well. - Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker. - early_stopping_rounds : int or None, optional (default=None) - Activates early stopping. The model will train until the validation score stops improving. - Validation score needs to improve at least every ``early_stopping_rounds`` round(s) - to continue training. - Requires at least one validation data and one metric. - If there's more than one, will check all of them. But the training data is ignored anyway. - To check only the first metric, set the ``first_metric_only`` parameter to ``True`` - in additional parameters ``**kwargs`` of the model constructor. - verbose : bool or int, optional (default=True) - Requires at least one evaluation data. - If True, the eval metric on the eval set is printed at each boosting stage. - If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage. - The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed. - - .. rubric:: Example - - With ``verbose`` = 4 and at least one item in ``eval_set``, - an evaluation metric is printed every 4 (instead of 1) boosting stages. - - feature_name : list of strings or 'auto', optional (default='auto') - Feature names. - If 'auto' and data is pandas DataFrame, data columns names are used. - categorical_feature : list of strings or int, or 'auto', optional (default='auto') - Categorical features. - If list of int, interpreted as indices. - If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). - If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. - All values in categorical features should be less than int32 max value (2147483647). - Large values could be memory consuming. Consider using consecutive integers starting from zero. - All negative values in categorical features will be treated as missing values. - The output cannot be monotonically constrained with respect to a categorical feature. - callbacks : list of callback functions or None, optional (default=None) - List of callback functions that are applied at each iteration. - See Callbacks in Python API for more information. - init_model : string, Booster, LGBMModel or None, optional (default=None) - Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training. - - Returns - ------- - self : object - Returns self. - - Note - ---- - Custom eval function expects a callable with following signatures: - ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or - ``func(y_true, y_pred, weight, group)`` - and returns (eval_name, eval_result, is_higher_better) or - list of (eval_name, eval_result, is_higher_better): - - y_true : array-like of shape = [n_samples] - The target values. - y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) - The predicted values. - weight : array-like of shape = [n_samples] - The weight of samples. - group : array-like - Group/query data. - Only used in the learning-to-rank task. - sum(group) = n_samples. - For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, - where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - eval_name : string - The name of evaluation function (without whitespaces). - eval_result : float - The eval result. - is_higher_better : bool - Is eval result higher better, e.g. AUC is ``is_higher_better``. - - For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``). - For multi-class task, the y_pred is group by class_id first, then group by row_id. - If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]. - """ if self._objective is None: if isinstance(self, LGBMRegressor): self._objective = "regression" @@ -648,6 +659,13 @@ def _get_meta_data(collection, name, i): del train_set, valid_sets return self + fit.__doc__ = _lgbmmodel_doc_fit.format( + X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", + y_shape="array-like of shape = [n_samples]", + sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", + group_shape="array-like or None, optional (default=None)" + ) + "\n\n" + _lgbmmodel_doc_custom_eval_note + def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=False, pred_contrib=False, **kwargs): """Return the predicted value for each sample. From 12f00062ffda71258ccb1a326fe58eced2a55ac1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 8 Feb 2021 23:06:23 -0600 Subject: [PATCH 2/7] add predict() --- python-package/lightgbm/dask.py | 30 ++++++++-- python-package/lightgbm/sklearn.py | 96 +++++++++++++++++------------- 2 files changed, 80 insertions(+), 46 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c3b2db11d017..af453ba03efd 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -19,7 +19,14 @@ from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat, SKLEARN_INSTALLED, LGBMNotFittedError, DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait) -from .sklearn import _lgbmmodel_doc_fit, LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker +from .sklearn import ( + _lgbmmodel_doc_fit, + _lgbmmodel_doc_predict, + LGBMClassifier, + LGBMModel, + LGBMRegressor, + LGBMRanker +) _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] _DaskMatrixLike = Union[dask_Array, dask_DataFrame] @@ -632,7 +639,12 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: **kwargs ) - predict.__doc__ = LGBMClassifier.predict.__doc__ + predict.__doc__ = _lgbmmodel_doc_predict.format( + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + ) def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba.""" @@ -765,7 +777,12 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array: **kwargs ) - predict.__doc__ = LGBMRegressor.predict.__doc__ + predict.__doc__ = _lgbmmodel_doc_predict.format( + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + ) def to_local(self) -> LGBMRegressor: """Create regular version of lightgbm.LGBMRegressor from the distributed version. @@ -892,7 +909,12 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMRanker.predict.""" return _predict(self.to_local(), X, **kwargs) - predict.__doc__ = LGBMRanker.predict.__doc__ + predict.__doc__ = _lgbmmodel_doc_predict.format( + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + ) def to_local(self) -> LGBMRanker: """Create regular version of lightgbm.LGBMRanker from the distributed version. diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 340a4d1a7b26..2aeb11800391 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -176,8 +176,8 @@ def __call__(self, preds, dataset): raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc) -# documentation for LGBMModel methods is shared between the classes here -# and those in the ``dask`` module +# documentation templates for LGBMModel methods are shared between the classes in +# this module and those in the ``dask`` module _lgbmmodel_doc_fit = ( """ @@ -294,6 +294,51 @@ def __call__(self, preds, dataset): If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]. """ +_lgbmmodel_doc_predict = ( + """ + Return the predicted value for each sample. + + Parameters + ---------- + X : {X_shape} + Input features matrix. + raw_score : bool, optional (default=False) + Whether to predict raw scores. + start_iteration : int, optional (default=0) + Start index of the iteration to predict. + If <= 0, starts from the first iteration. + num_iteration : int or None, optional (default=None) + Total number of iterations used in the prediction. + If None, if the best iteration exists and start_iteration <= 0, the best iteration is used; + otherwise, all iterations from ``start_iteration`` are used (no limits). + If <= 0, all iterations from ``start_iteration`` are used (no limits). + pred_leaf : bool, optional (default=False) + Whether to predict leaf index. + pred_contrib : bool, optional (default=False) + Whether to predict feature contributions. + + .. note:: + + If you want to get more explanations for your model's predictions using SHAP values, + like SHAP interaction values, + you can install the shap package (https://github.com/slundberg/shap). + Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra + column, where the last column is the expected value. + + **kwargs + Other parameters for the prediction. + + Returns + ------- + predicted_result : {predicted_result_shape} + The predicted values. + X_leaves : {X_leaves_shape} + If ``pred_leaf=True``, the predicted leaf of every tree for each sample. + X_SHAP_values : {X_SHAP_values_shape} + If ``pred_contrib=True``, the feature contributions for each sample. + """ +) + class LGBMModel(_LGBMModelBase): """Implementation of the scikit-learn API for LightGBM.""" @@ -668,47 +713,7 @@ def _get_meta_data(collection, name, i): def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=False, pred_contrib=False, **kwargs): - """Return the predicted value for each sample. - Parameters - ---------- - X : array-like or sparse matrix of shape = [n_samples, n_features] - Input features matrix. - raw_score : bool, optional (default=False) - Whether to predict raw scores. - start_iteration : int, optional (default=0) - Start index of the iteration to predict. - If <= 0, starts from the first iteration. - num_iteration : int or None, optional (default=None) - Total number of iterations used in the prediction. - If None, if the best iteration exists and start_iteration <= 0, the best iteration is used; - otherwise, all iterations from ``start_iteration`` are used (no limits). - If <= 0, all iterations from ``start_iteration`` are used (no limits). - pred_leaf : bool, optional (default=False) - Whether to predict leaf index. - pred_contrib : bool, optional (default=False) - Whether to predict feature contributions. - - .. note:: - - If you want to get more explanations for your model's predictions using SHAP values, - like SHAP interaction values, - you can install the shap package (https://github.com/slundberg/shap). - Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra - column, where the last column is the expected value. - - **kwargs - Other parameters for the prediction. - - Returns - ------- - predicted_result : array-like of shape = [n_samples] or shape = [n_samples, n_classes] - The predicted values. - X_leaves : array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] - If ``pred_leaf=True``, the predicted leaf of every tree for each sample. - X_SHAP_values : array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects - If ``pred_contrib=True``, the feature contributions for each sample. - """ if self._n_features is None: raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.") if not isinstance(X, (pd_DataFrame, dt_DataTable)): @@ -722,6 +727,13 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration, pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs) + predict.__doc__ = _lgbmmodel_doc_predict.format( + X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", + predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects" + ) + @property def n_features_(self): """:obj:`int`: The number of features of fitted model.""" From 4f47f8733944722ac24958b2766bc5510cda6784 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 8 Feb 2021 23:31:02 -0600 Subject: [PATCH 3/7] predict_proba() --- python-package/lightgbm/dask.py | 24 +++++++++---- python-package/lightgbm/sklearn.py | 55 +++++++----------------------- 2 files changed, 30 insertions(+), 49 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index af453ba03efd..072732a11dc2 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -618,8 +618,7 @@ def fit( group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" ) - # DaskLGBMClassifier does not support init_score, evaluation data, - # or early stopping + # DaskLGBMClassifier does not support init_score, evaluation data, or early stopping _base_doc = (_base_doc[:_base_doc.find('init_score :')] + _base_doc[_base_doc.find('verbose :'):]) @@ -640,7 +639,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: ) predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_result", predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" @@ -655,7 +656,14 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: **kwargs ) - predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__ + predict_proba.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted probability for each class for each sample.", + X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_probability", + predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + ) def to_local(self) -> LGBMClassifier: """Create regular version of lightgbm.LGBMClassifier from the distributed version. @@ -757,8 +765,7 @@ def fit( group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" ) - # DaskLGBMRegressor does not support init_score, evaluation data, - # or early stopping + # DaskLGBMRegressor does not support init_score, evaluation data, or early stopping _base_doc = (_base_doc[:_base_doc.find('init_score :')] + _base_doc[_base_doc.find('verbose :'):]) @@ -778,7 +785,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array: ) predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_result", predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" @@ -890,8 +899,7 @@ def fit( group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" ) - # DaskLGBMRanker does not support init_score, evaluation data, - # or early stopping + # DaskLGBMRanker does not support init_score, evaluation data, or early stopping _base_doc = (_base_doc[:_base_doc.find('init_score :')] + _base_doc[_base_doc.find('init_score :'):]) @@ -910,7 +918,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: return _predict(self.to_local(), X, **kwargs) predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_result", predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 2aeb11800391..c9ff03d6511b 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -296,7 +296,7 @@ def __call__(self, preds, dataset): _lgbmmodel_doc_predict = ( """ - Return the predicted value for each sample. + {description} Parameters ---------- @@ -330,7 +330,7 @@ def __call__(self, preds, dataset): Returns ------- - predicted_result : {predicted_result_shape} + {output_name} : {predicted_result_shape} The predicted values. X_leaves : {X_leaves_shape} If ``pred_leaf=True``, the predicted leaf of every tree for each sample. @@ -728,7 +728,9 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs) predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", + output_name="predicted_result", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects" @@ -915,47 +917,7 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=False, pred_contrib=False, **kwargs): - """Return the predicted probability for each class for each sample. - Parameters - ---------- - X : array-like or sparse matrix of shape = [n_samples, n_features] - Input features matrix. - raw_score : bool, optional (default=False) - Whether to predict raw scores. - start_iteration : int, optional (default=0) - Start index of the iteration to predict. - If <= 0, starts from the first iteration. - num_iteration : int or None, optional (default=None) - Total number of iterations used in the prediction. - If None, if the best iteration exists and start_iteration <= 0, the best iteration is used; - otherwise, all iterations from ``start_iteration`` are used (no limits). - If <= 0, all iterations from ``start_iteration`` are used (no limits). - pred_leaf : bool, optional (default=False) - Whether to predict leaf index. - pred_contrib : bool, optional (default=False) - Whether to predict feature contributions. - - .. note:: - - If you want to get more explanations for your model's predictions using SHAP values, - like SHAP interaction values, - you can install the shap package (https://github.com/slundberg/shap). - Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra - column, where the last column is the expected value. - - **kwargs - Other parameters for the prediction. - - Returns - ------- - predicted_probability : array-like of shape = [n_samples, n_classes] - The predicted probability for each class for each sample. - X_leaves : array-like of shape = [n_samples, n_trees * n_classes] - If ``pred_leaf=True``, the predicted leaf of every tree for each sample. - X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects - If ``pred_contrib=True``, the feature contributions for each sample. - """ result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs) if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib): _log_warning("Cannot compute class probabilities or labels " @@ -967,6 +929,15 @@ def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=Non else: return np.vstack((1. - result, result)).transpose() + predict_proba.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted probability for each class for each sample.", + X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", + output_name="predicted_probability", + predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects" + ) + @property def classes_(self): """:obj:`array` of shape = [n_classes]: The class label array.""" From 9f9bb8e8ab8acabcefc7393e7529fa0fc3898ef2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 9 Feb 2021 00:03:54 -0600 Subject: [PATCH 4/7] remove custom objective docs --- python-package/lightgbm/dask.py | 18 +++++++++++++++--- python-package/lightgbm/sklearn.py | 6 +++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 072732a11dc2..51aff5f492ed 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -585,13 +585,17 @@ def __init__( _base_doc = LGBMClassifier.__init__.__doc__ _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') - __init__.__doc__ = ( + _base_doc = ( _before_kwargs + 'client : dask.distributed.Client or None, optional (default=None)\n' + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n' + ' ' * 8 + _kwargs + _after_kwargs ) + # the note on custom objective functions in LGBMModel.__init__ is not + # currently relevant for the Dask estimators + __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')] + def __getstate__(self) -> Dict[Any, Any]: return self._lgb_getstate() @@ -732,13 +736,17 @@ def __init__( _base_doc = LGBMRegressor.__init__.__doc__ _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') - __init__.__doc__ = ( + _base_doc = ( _before_kwargs + 'client : dask.distributed.Client or None, optional (default=None)\n' + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n' + ' ' * 8 + _kwargs + _after_kwargs ) + # the note on custom objective functions in LGBMModel.__init__ is not + # currently relevant for the Dask estimators + __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')] + def __getstate__(self) -> Dict[Any, Any]: return self._lgb_getstate() @@ -860,13 +868,17 @@ def __init__( _base_doc = LGBMRanker.__init__.__doc__ _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') - __init__.__doc__ = ( + _base_doc = ( _before_kwargs + 'client : dask.distributed.Client or None, optional (default=None)\n' + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n' + ' ' * 8 + _kwargs + _after_kwargs ) + # the note on custom objective functions in LGBMModel.__init__ is not + # currently relevant for the Dask estimators + __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')] + def __getstate__(self) -> Dict[Any, Any]: return self._lgb_getstate() diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index c9ff03d6511b..3b5fc01648e6 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -546,7 +546,7 @@ def fit(self, X, y, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None, init_model=None): - + """Docstring is set after definition, using a template.""" if self._objective is None: if isinstance(self, LGBMRegressor): self._objective = "regression" @@ -713,7 +713,7 @@ def _get_meta_data(collection, name, i): def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=False, pred_contrib=False, **kwargs): - + """Docstring is set after definition, using a template.""" if self._n_features is None: raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.") if not isinstance(X, (pd_DataFrame, dt_DataTable)): @@ -917,7 +917,7 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None, pred_leaf=False, pred_contrib=False, **kwargs): - + """Docstring is set after definition, using a template.""" result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs) if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib): _log_warning("Cannot compute class probabilities or labels " From 7fdb0bbe1261c9f92cb34f699ee7be2e21ab7d5d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 9 Feb 2021 11:46:03 -0600 Subject: [PATCH 5/7] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/dask.py | 14 +++++++------- python-package/lightgbm/sklearn.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 51aff5f492ed..7c128579dfd6 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -664,7 +664,7 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: description="Return the predicted probability for each class for each sample.", X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_probability", - predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + predicted_result_shape="dask Array of shape = [n_samples, n_classes]", X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" ) @@ -796,9 +796,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array: description="Return the predicted value for each sample.", X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", - X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + predicted_result_shape="dask Array of shape = [n_samples]", + X_leaves_shape="dask Array of shape = [n_samples, n_trees]", + X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1]" ) def to_local(self) -> LGBMRegressor: @@ -933,9 +933,9 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: description="Return the predicted value for each sample.", X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", - X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + predicted_result_shape="dask Array of shape = [n_samples]", + X_leaves_shape="dask Array of shape = [n_samples, n_trees]", + X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1]" ) def to_local(self) -> LGBMRanker: diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 3b5fc01648e6..ce5abe24f8d4 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -933,7 +933,7 @@ def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=Non description="Return the predicted probability for each class for each sample.", X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", output_name="predicted_probability", - predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", + predicted_result_shape="array-like of shape = [n_samples, n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects" ) From 8cceecf2f0b184f22559da58c8c12891e8bcd6e3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 9 Feb 2021 13:26:38 -0600 Subject: [PATCH 6/7] fix capitalization --- python-package/lightgbm/dask.py | 78 +++++++++++++------------- tests/python_package_test/test_dask.py | 4 +- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 7c128579dfd6..b381789e37d3 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1,8 +1,8 @@ # coding: utf-8 -"""Distributed training with LightGBM and Dask.distributed. +"""Distributed training with LightGBM and dask.distributed. This module enables you to perform distributed training with LightGBM on -Dask.Array and Dask.DataFrame collections. +dask.Array and dask.DataFrame collections. It is based on dask-lightgbm, which was based on dask-xgboost. """ @@ -223,17 +223,17 @@ def _train( ---------- client : dask.distributed.Client Dask client. - data : dask Array or dask DataFrame of shape = [n_samples, n_features] + data : Dask Array or Dask DataFrame of shape = [n_samples, n_features] Input feature matrix. - label : dask Array, dask DataFrame or dask Series of shape = [n_samples] + label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples] The target values (class labels in classification, real numbers in regression). params : dict Parameters passed to constructor of the local underlying model. model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class Class of the local underlying model. - sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) + sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) Weights of training data. - group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) + group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -403,7 +403,7 @@ def _predict( ---------- model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class Fitted underlying model. - data : dask Array or dask DataFrame of shape = [n_samples, n_features] + data : Dask Array or Dask DataFrame of shape = [n_samples, n_features] Input feature matrix. raw_score : bool, optional (default=False) Whether to predict raw scores. @@ -420,11 +420,11 @@ def _predict( Returns ------- - predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes] + predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes] The predicted values. - X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] + X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes] If ``pred_leaf=True``, the predicted leaf of every tree for each sample. - X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] + X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] If ``pred_contrib=True``, the feature contributions for each sample. """ if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): @@ -455,7 +455,7 @@ def _predict( **kwargs ) else: - raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data))) + raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data))) class _DaskLGBMModel: @@ -616,10 +616,10 @@ def fit( ) _base_doc = _lgbmmodel_doc_fit.format( - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", - y_shape="dask Array, dask DataFrame or dask Series of shape = [n_samples]", - sample_weight_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", - group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", + sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" ) # DaskLGBMClassifier does not support init_score, evaluation data, or early stopping @@ -644,11 +644,11 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: predict.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted value for each sample.", - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", - X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" ) def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: @@ -662,11 +662,11 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: predict_proba.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted probability for each class for each sample.", - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_probability", - predicted_result_shape="dask Array of shape = [n_samples, n_classes]", - X_leaves_shape="dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" + predicted_result_shape="Dask Array of shape = [n_samples, n_classes]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]" ) def to_local(self) -> LGBMClassifier: @@ -767,10 +767,10 @@ def fit( ) _base_doc = _lgbmmodel_doc_fit.format( - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", - y_shape="dask Array, dask DataFrame or dask Series of shape = [n_samples]", - sample_weight_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", - group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", + sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" ) # DaskLGBMRegressor does not support init_score, evaluation data, or early stopping @@ -794,11 +794,11 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array: predict.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted value for each sample.", - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="dask Array of shape = [n_samples]", - X_leaves_shape="dask Array of shape = [n_samples, n_trees]", - X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1]" + predicted_result_shape="Dask Array of shape = [n_samples]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]" ) def to_local(self) -> LGBMRegressor: @@ -905,10 +905,10 @@ def fit( ) _base_doc = _lgbmmodel_doc_fit.format( - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", - y_shape="dask Array, dask DataFrame or dask Series of shape = [n_samples]", - sample_weight_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", - group_shape="dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", + sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)" ) # DaskLGBMRanker does not support init_score, evaluation data, or early stopping @@ -931,11 +931,11 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array: predict.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted value for each sample.", - X_shape="dask Array or dask DataFrame of shape = [n_samples, n_features]", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", output_name="predicted_result", - predicted_result_shape="dask Array of shape = [n_samples]", - X_leaves_shape="dask Array of shape = [n_samples, n_trees]", - X_SHAP_values_shape="dask Array of shape = [n_samples, n_features + 1]" + predicted_result_shape="Dask Array of shape = [n_samples]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]" ) def to_local(self) -> LGBMRanker: diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index c441466b3151..239cacb1e22b 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group): group=group, ) - # rebalance small dask.array dataset for better performance. + # rebalance small Dask.array dataset for better performance. if output == 'array': dX = dX.persist() dy = dy.persist() @@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group): _ = wait([dX, dy, dw, dg]) client.rebalance() - # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of + # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210. params = { "random_state": 42, From 56f11c1ac83409263dfffac16a92965fd4eafda3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 9 Feb 2021 14:00:36 -0600 Subject: [PATCH 7/7] Update tests/python_package_test/test_dask.py Co-authored-by: Nikita Titov --- tests/python_package_test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 239cacb1e22b..6ed56e87f800 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group): group=group, ) - # rebalance small Dask.array dataset for better performance. + # rebalance small dask.Array dataset for better performance. if output == 'array': dX = dX.persist() dy = dy.persist()