diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
index a62f552b60dd..b381789e37d3 100644
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -1,8 +1,8 @@
 # coding: utf-8
-"""Distributed training with LightGBM and Dask.distributed.
+"""Distributed training with LightGBM and dask.distributed.
 
 This module enables you to perform distributed training with LightGBM on
-Dask.Array and Dask.DataFrame collections.
+dask.Array and dask.DataFrame collections.
 
 It is based on dask-lightgbm, which was based on dask-xgboost.
 """
@@ -19,7 +19,14 @@
 from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
                      SKLEARN_INSTALLED, LGBMNotFittedError,
                      DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
-from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker
+from .sklearn import (
+    _lgbmmodel_doc_fit,
+    _lgbmmodel_doc_predict,
+    LGBMClassifier,
+    LGBMModel,
+    LGBMRegressor,
+    LGBMRanker
+)
 
 _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
 _DaskMatrixLike = Union[dask_Array, dask_DataFrame]
@@ -216,17 +223,17 @@ def _train(
     ----------
     client : dask.distributed.Client
         Dask client.
-    data : dask Array or dask DataFrame of shape = [n_samples, n_features]
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
         Input feature matrix.
-    label : dask Array, dask DataFrame or dask Series of shape = [n_samples]
+    label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
         The target values (class labels in classification, real numbers in regression).
     params : dict
         Parameters passed to constructor of the local underlying model.
     model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
         Class of the local underlying model.
-    sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+    sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
         Weights of training data.
-    group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+    group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
         Group/query data.
         Only used in the learning-to-rank task.
         sum(group) = n_samples.
@@ -396,7 +403,7 @@ def _predict(
     ----------
     model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
         Fitted underlying model.
-    data : dask Array or dask DataFrame of shape = [n_samples, n_features]
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
         Input feature matrix.
     raw_score : bool, optional (default=False)
         Whether to predict raw scores.
@@ -413,11 +420,11 @@ def _predict(
 
     Returns
     -------
-    predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
+    predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
         The predicted values.
-    X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
+    X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
         If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-    X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
+    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
         If ``pred_contrib=True``, the feature contributions for each sample.
     """
     if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
@@ -448,7 +455,7 @@ def _predict(
             **kwargs
         )
     else:
-        raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data)))
+        raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))
 
 
 class _DaskLGBMModel:
@@ -578,13 +585,17 @@ def __init__(
 
     _base_doc = LGBMClassifier.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs
         + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
@@ -604,7 +615,23 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMClassifier.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMClassifier support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
@@ -615,7 +642,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
             **kwargs
         )
 
-    predict.__doc__ = LGBMClassifier.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
+    )
 
     def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
@@ -626,7 +660,14 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
             **kwargs
         )
 
-    predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__
+    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted probability for each class for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_probability",
+        predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
+    )
 
     def to_local(self) -> LGBMClassifier:
         """Create regular version of lightgbm.LGBMClassifier from the distributed version.
@@ -695,13 +736,17 @@ def __init__(
 
     _base_doc = LGBMRegressor.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs
         + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
@@ -721,7 +766,23 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMRegressor.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMRegressor support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
@@ -731,7 +792,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
             **kwargs
         )
 
-    predict.__doc__ = LGBMRegressor.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
+    )
 
     def to_local(self) -> LGBMRegressor:
         """Create regular version of lightgbm.LGBMRegressor from the distributed version.
@@ -800,13 +868,17 @@ def __init__(
 
     _base_doc = LGBMRanker.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs
         + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
@@ -832,13 +904,39 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMRanker.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMRanker does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('group :'):])
+
+    _base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMRanker support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMRanker.predict."""
         return _predict(self.to_local(), X, **kwargs)
 
-    predict.__doc__ = LGBMRanker.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
+    )
 
     def to_local(self) -> LGBMRanker:
         """Create regular version of lightgbm.LGBMRanker from the distributed version.
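Aside (not part of the patch): every Dask ``fit.__doc__`` block above follows the same two-step pattern — fill the shared ``_lgbmmodel_doc_fit`` template with ``str.format()``, then cut away the parameter entries the Dask API does not support by slicing on ``str.find()``. Note that the ranker variant resumes its first slice at ``'group :'`` so that the ``group`` entry survives while ``init_score`` is dropped. A minimal, runnable sketch of the trick; the template and section names below are made-up stand-ins, not LightGBM's real strings:

    # hypothetical template standing in for _lgbmmodel_doc_fit
    _doc_template = """Build a model.

        Parameters
        ----------
        X : {X_shape}
            Input feature matrix.
        init_score : array-like or None
            Init score of training data.
        verbose : bool
            Verbosity.
        callbacks : list or None
            Callback functions.
        """

    # step 1: fill the placeholders
    _base_doc = _doc_template.format(X_shape="Dask Array of shape = [n_samples, n_features]")

    # step 2: drop the 'init_score' entry by keeping the text before it
    # and resuming at the next entry that should survive
    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
                 + _base_doc[_base_doc.find('verbose :'):])

    # step 3: truncate at 'callbacks' and document **kwargs instead
    fit_doc = (_base_doc[:_base_doc.find('callbacks :')]
               + '**kwargs\n'
               + ' ' * 12 + 'Other parameters passed through to ``fit()``\n')

Because each slice ends right before a parameter name, the indentation that preceded the removed entry is left in place and lines up with the entry the slice resumes at.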
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 6a48f2c7b7d5..ce5abe24f8d4 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -176,6 +176,170 @@ def __call__(self, preds, dataset):
         raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
 
 
+# documentation templates for LGBMModel methods are shared between the classes in
+# this module and those in the ``dask`` module
+
+_lgbmmodel_doc_fit = (
+    """
+    Build a gradient boosting model from the training set (X, y).
+
+    Parameters
+    ----------
+    X : {X_shape}
+        Input feature matrix.
+    y : {y_shape}
+        The target values (class labels in classification, real numbers in regression).
+    sample_weight : {sample_weight_shape}
+        Weights of training data.
+    init_score : array-like of shape = [n_samples] or None, optional (default=None)
+        Init score of training data.
+    group : {group_shape}
+        Group/query data.
+        Only used in the learning-to-rank task.
+        sum(group) = n_samples.
+        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+    eval_set : list or None, optional (default=None)
+        A list of (X, y) tuple pairs to use as validation sets.
+    eval_names : list of strings or None, optional (default=None)
+        Names of eval_set.
+    eval_sample_weight : list of arrays or None, optional (default=None)
+        Weights of eval data.
+    eval_class_weight : list or None, optional (default=None)
+        Class weights of eval data.
+    eval_init_score : list of arrays or None, optional (default=None)
+        Init score of eval data.
+    eval_group : list of arrays or None, optional (default=None)
+        Group data of eval data.
+    eval_metric : string, callable, list or None, optional (default=None)
+        If string, it should be a built-in evaluation metric to use.
+        If callable, it should be a custom evaluation metric, see note below for more details.
+        If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both.
+        In either case, the ``metric`` from the model parameters will be evaluated and used as well.
+        Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
+    early_stopping_rounds : int or None, optional (default=None)
+        Activates early stopping. The model will train until the validation score stops improving.
+        Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
+        to continue training.
+        Requires at least one validation data and one metric.
+        If there's more than one, will check all of them. But the training data is ignored anyway.
+        To check only the first metric, set the ``first_metric_only`` parameter to ``True``
+        in additional parameters ``**kwargs`` of the model constructor.
+    verbose : bool or int, optional (default=True)
+        Requires at least one evaluation data.
+        If True, the eval metric on the eval set is printed at each boosting stage.
+        If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage.
+        The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.
+
+        .. rubric:: Example
+
+        With ``verbose`` = 4 and at least one item in ``eval_set``,
+        an evaluation metric is printed every 4 (instead of 1) boosting stages.
+
+    feature_name : list of strings or 'auto', optional (default='auto')
+        Feature names.
+        If 'auto' and data is pandas DataFrame, data columns names are used.
+    categorical_feature : list of strings or int, or 'auto', optional (default='auto')
+        Categorical features.
+        If list of int, interpreted as indices.
+        If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
+        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
+        All values in categorical features should be less than int32 max value (2147483647).
+        Large values could be memory consuming. Consider using consecutive integers starting from zero.
+        All negative values in categorical features will be treated as missing values.
+        The output cannot be monotonically constrained with respect to a categorical feature.
+    callbacks : list of callback functions or None, optional (default=None)
+        List of callback functions that are applied at each iteration.
+        See Callbacks in Python API for more information.
+    init_model : string, Booster, LGBMModel or None, optional (default=None)
+        Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
+
+    Returns
+    -------
+    self : object
+        Returns self.
+    """
+)
+
+_lgbmmodel_doc_custom_eval_note = """
+    Note
+    ----
+    Custom eval function expects a callable with following signatures:
+    ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
+    ``func(y_true, y_pred, weight, group)``
+    and returns (eval_name, eval_result, is_higher_better) or
+    list of (eval_name, eval_result, is_higher_better):
+
+        y_true : array-like of shape = [n_samples]
+            The target values.
+        y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+            The predicted values.
+        weight : array-like of shape = [n_samples]
+            The weight of samples.
+        group : array-like
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+        eval_name : string
+            The name of evaluation function (without whitespaces).
+        eval_result : float
+            The eval result.
+        is_higher_better : bool
+            Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
+    For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``).
+    For multi-class task, the y_pred is group by class_id first, then group by row_id.
+    If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
+"""
+
+_lgbmmodel_doc_predict = (
+    """
+    {description}
+
+    Parameters
+    ----------
+    X : {X_shape}
+        Input features matrix.
+    raw_score : bool, optional (default=False)
+        Whether to predict raw scores.
+    start_iteration : int, optional (default=0)
+        Start index of the iteration to predict.
+        If <= 0, starts from the first iteration.
+    num_iteration : int or None, optional (default=None)
+        Total number of iterations used in the prediction.
+        If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
+        otherwise, all iterations from ``start_iteration`` are used (no limits).
+        If <= 0, all iterations from ``start_iteration`` are used (no limits).
+    pred_leaf : bool, optional (default=False)
+        Whether to predict leaf index.
+    pred_contrib : bool, optional (default=False)
+        Whether to predict feature contributions.
+
+        .. note::
+
+            If you want to get more explanations for your model's predictions using SHAP values,
+            like SHAP interaction values,
+            you can install the shap package (https://github.com/slundberg/shap).
+            Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
+            column, where the last column is the expected value.
+
+    **kwargs
+        Other parameters for the prediction.
+
+    Returns
+    -------
+    {output_name} : {predicted_result_shape}
+        The predicted values.
+    X_leaves : {X_leaves_shape}
+        If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
+    X_SHAP_values : {X_SHAP_values_shape}
+        If ``pred_contrib=True``, the feature contributions for each sample.
+    """
+)
+
+
 class LGBMModel(_LGBMModelBase):
     """Implementation of the scikit-learn API for LightGBM."""
 
@@ -382,115 +546,7 @@ def fit(self, X, y,
             eval_metric=None, early_stopping_rounds=None, verbose=True,
             feature_name='auto', categorical_feature='auto',
             callbacks=None, init_model=None):
-        """Build a gradient boosting model from the training set (X, y).
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix of shape = [n_samples, n_features]
-            Input feature matrix.
-        y : array-like of shape = [n_samples]
-            The target values (class labels in classification, real numbers in regression).
-        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
-            Weights of training data.
-        init_score : array-like of shape = [n_samples] or None, optional (default=None)
-            Init score of training data.
-        group : array-like or None, optional (default=None)
-            Group/query data.
-            Only used in the learning-to-rank task.
-            sum(group) = n_samples.
-            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
-            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
-        eval_set : list or None, optional (default=None)
-            A list of (X, y) tuple pairs to use as validation sets.
-        eval_names : list of strings or None, optional (default=None)
-            Names of eval_set.
-        eval_sample_weight : list of arrays or None, optional (default=None)
-            Weights of eval data.
-        eval_class_weight : list or None, optional (default=None)
-            Class weights of eval data.
-        eval_init_score : list of arrays or None, optional (default=None)
-            Init score of eval data.
-        eval_group : list of arrays or None, optional (default=None)
-            Group data of eval data.
-        eval_metric : string, callable, list or None, optional (default=None)
-            If string, it should be a built-in evaluation metric to use.
-            If callable, it should be a custom evaluation metric, see note below for more details.
-            If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both.
-            In either case, the ``metric`` from the model parameters will be evaluated and used as well.
-            Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
-        early_stopping_rounds : int or None, optional (default=None)
-            Activates early stopping. The model will train until the validation score stops improving.
-            Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
-            to continue training.
-            Requires at least one validation data and one metric.
-            If there's more than one, will check all of them. But the training data is ignored anyway.
-            To check only the first metric, set the ``first_metric_only`` parameter to ``True``
-            in additional parameters ``**kwargs`` of the model constructor.
-        verbose : bool or int, optional (default=True)
-            Requires at least one evaluation data.
-            If True, the eval metric on the eval set is printed at each boosting stage.
-            If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage.
-            The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.
-
-            .. rubric:: Example
-
-            With ``verbose`` = 4 and at least one item in ``eval_set``,
-            an evaluation metric is printed every 4 (instead of 1) boosting stages.
-
-        feature_name : list of strings or 'auto', optional (default='auto')
-            Feature names.
-            If 'auto' and data is pandas DataFrame, data columns names are used.
-        categorical_feature : list of strings or int, or 'auto', optional (default='auto')
-            Categorical features.
-            If list of int, interpreted as indices.
-            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
-            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
-            All values in categorical features should be less than int32 max value (2147483647).
-            Large values could be memory consuming. Consider using consecutive integers starting from zero.
-            All negative values in categorical features will be treated as missing values.
-            The output cannot be monotonically constrained with respect to a categorical feature.
-        callbacks : list of callback functions or None, optional (default=None)
-            List of callback functions that are applied at each iteration.
-            See Callbacks in Python API for more information.
-        init_model : string, Booster, LGBMModel or None, optional (default=None)
-            Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
-
-        Returns
-        -------
-        self : object
-            Returns self.
-
-        Note
-        ----
-        Custom eval function expects a callable with following signatures:
-        ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
-        ``func(y_true, y_pred, weight, group)``
-        and returns (eval_name, eval_result, is_higher_better) or
-        list of (eval_name, eval_result, is_higher_better):
-
-            y_true : array-like of shape = [n_samples]
-                The target values.
-            y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The predicted values.
-            weight : array-like of shape = [n_samples]
-                The weight of samples.
-            group : array-like
-                Group/query data.
-                Only used in the learning-to-rank task.
-                sum(group) = n_samples.
-                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
-                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
-            eval_name : string
-                The name of evaluation function (without whitespaces).
-            eval_result : float
-                The eval result.
-            is_higher_better : bool
-                Is eval result higher better, e.g. AUC is ``is_higher_better``.
-
-        For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``).
-        For multi-class task, the y_pred is group by class_id first, then group by row_id.
-        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
-        """
+        """Docstring is set after definition, using a template."""
         if self._objective is None:
             if isinstance(self, LGBMRegressor):
                 self._objective = "regression"
@@ -648,49 +704,16 @@ def _get_meta_data(collection, name, i):
         del train_set, valid_sets
         return self
 
+    fit.__doc__ = _lgbmmodel_doc_fit.format(
+        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        y_shape="array-like of shape = [n_samples]",
+        sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
+        group_shape="array-like or None, optional (default=None)"
+    ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
+
     def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
                 pred_leaf=False, pred_contrib=False, **kwargs):
-        """Return the predicted value for each sample.
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix of shape = [n_samples, n_features]
-            Input features matrix.
-        raw_score : bool, optional (default=False)
-            Whether to predict raw scores.
-        start_iteration : int, optional (default=0)
-            Start index of the iteration to predict.
-            If <= 0, starts from the first iteration.
-        num_iteration : int or None, optional (default=None)
-            Total number of iterations used in the prediction.
-            If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
-            otherwise, all iterations from ``start_iteration`` are used (no limits).
-            If <= 0, all iterations from ``start_iteration`` are used (no limits).
-        pred_leaf : bool, optional (default=False)
-            Whether to predict leaf index.
-        pred_contrib : bool, optional (default=False)
-            Whether to predict feature contributions.
-
-            .. note::
-
-                If you want to get more explanations for your model's predictions using SHAP values,
-                like SHAP interaction values,
-                you can install the shap package (https://github.com/slundberg/shap).
-                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
-                column, where the last column is the expected value.
-
-        **kwargs
-            Other parameters for the prediction.
-
-        Returns
-        -------
-        predicted_result : array-like of shape = [n_samples] or shape = [n_samples, n_classes]
-            The predicted values.
-        X_leaves : array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
-            If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-        X_SHAP_values : array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
-            If ``pred_contrib=True``, the feature contributions for each sample.
-        """
+        """Docstring is set after definition, using a template."""
         if self._n_features is None:
             raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.")
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
@@ -704,6 +727,15 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
         return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration,
                                      num_iteration=num_iteration, pred_leaf=pred_leaf, pred_contrib=pred_contrib,
                                      **kwargs)
 
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+    )
+
     @property
     def n_features_(self):
         """:obj:`int`: The number of features of fitted model."""
@@ -885,47 +917,7 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
 
     def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None,
                       pred_leaf=False, pred_contrib=False, **kwargs):
-        """Return the predicted probability for each class for each sample.
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix of shape = [n_samples, n_features]
-            Input features matrix.
-        raw_score : bool, optional (default=False)
-            Whether to predict raw scores.
-        start_iteration : int, optional (default=0)
-            Start index of the iteration to predict.
-            If <= 0, starts from the first iteration.
-        num_iteration : int or None, optional (default=None)
-            Total number of iterations used in the prediction.
-            If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
-            otherwise, all iterations from ``start_iteration`` are used (no limits).
-            If <= 0, all iterations from ``start_iteration`` are used (no limits).
-        pred_leaf : bool, optional (default=False)
-            Whether to predict leaf index.
-        pred_contrib : bool, optional (default=False)
-            Whether to predict feature contributions.
-
-            .. note::
-
-                If you want to get more explanations for your model's predictions using SHAP values,
-                like SHAP interaction values,
-                you can install the shap package (https://github.com/slundberg/shap).
-                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
-                column, where the last column is the expected value.
-
-        **kwargs
-            Other parameters for the prediction.
-
-        Returns
-        -------
-        predicted_probability : array-like of shape = [n_samples, n_classes]
-            The predicted probability for each class for each sample.
-        X_leaves : array-like of shape = [n_samples, n_trees * n_classes]
-            If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-        X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
-            If ``pred_contrib=True``, the feature contributions for each sample.
-        """
+        """Docstring is set after definition, using a template."""
         result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs)
         if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
             _log_warning("Cannot compute class probabilities or labels "
@@ -937,6 +929,15 @@ def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None,
         else:
             return np.vstack((1. - result, result)).transpose()
 
+    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted probability for each class for each sample.",
+        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        output_name="predicted_probability",
+        predicted_result_shape="array-like of shape = [n_samples, n_classes]",
+        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+    )
+
     @property
     def classes_(self):
         """:obj:`array` of shape = [n_classes]: The class label array."""
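Aside (not part of the patch): the sklearn.py hunks above work because a method is an ordinary function object while its class body executes, so ``__doc__`` can be assigned immediately after the ``def`` with a plain statement at class scope. A small illustrative sketch; the template and class here are stand-ins, not LightGBM code:

    # hypothetical template standing in for _lgbmmodel_doc_predict
    _doc_predict = """{description}

        X : {X_shape}
            Input features matrix.
        """

    class Model:
        def predict(self, X):
            """Docstring is set after definition, using a template."""
            return X

        # runs once, at class-creation time; help(Model.predict) then
        # shows the formatted text instead of the placeholder above
        predict.__doc__ = _doc_predict.format(
            description="Return the predicted value for each sample.",
            X_shape="array-like of shape = [n_samples, n_features]"
        )

This is why the diff replaces each long literal docstring with the short placeholder "Docstring is set after definition, using a template." and moves the real text into the class-level ``format()`` call.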
diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py
index c441466b3151..6ed56e87f800 100644
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group):
         group=group,
     )
 
-    # rebalance small dask.array dataset for better performance.
+    # rebalance small dask.Array dataset for better performance.
     if output == 'array':
         dX = dX.persist()
         dy = dy.persist()
@@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group):
     _ = wait([dX, dy, dw, dg])
     client.rebalance()
 
-    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
+    # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
     # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
     params = {
         "random_state": 42,
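Aside (not part of the patch): one last piece of string surgery used by the Dask estimators, not illustrated above, is splicing the ``client`` parameter into an inherited ``__init__`` docstring. ``str.partition('**kwargs')`` splits the docstring at the ``**kwargs`` entry, and the new entry is inserted in front of it with the numpydoc indentation maintained by hand (8 spaces for parameter names, 12 for their descriptions). A sketch against a made-up base docstring:

    # hypothetical base docstring standing in for LGBMClassifier.__init__.__doc__
    _base_doc = """Construct a model.

        Parameters
        ----------
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        **kwargs
            Other parameters for the model.
        """

    # _before_kwargs ends with the 8 spaces that preceded '**kwargs',
    # so the new 'client' entry inherits that indentation
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
    _new_doc = (
        _before_kwargs
        + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client.\n'
        + ' ' * 8 + _kwargs
        + _after_kwargs
    )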