diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
index a62f552b60dd..b381789e37d3 100644
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -1,8 +1,8 @@
 # coding: utf-8
-"""Distributed training with LightGBM and Dask.distributed.
+"""Distributed training with LightGBM and dask.distributed.
 
 This module enables you to perform distributed training with LightGBM on
-Dask.Array and Dask.DataFrame collections.
+dask.Array and dask.DataFrame collections.
 
 It is based on dask-lightgbm, which was based on dask-xgboost.
 """
@@ -19,7 +19,14 @@
 from .compat import (PANDAS_INSTALLED, pd_DataFrame, pd_Series, concat,
                      SKLEARN_INSTALLED, LGBMNotFittedError,
                      DASK_INSTALLED, dask_DataFrame, dask_Array, dask_Series, delayed, Client, default_client, get_worker, wait)
-from .sklearn import LGBMClassifier, LGBMModel, LGBMRegressor, LGBMRanker
+from .sklearn import (
+    _lgbmmodel_doc_fit,
+    _lgbmmodel_doc_predict,
+    LGBMClassifier,
+    LGBMModel,
+    LGBMRegressor,
+    LGBMRanker
+)
 
 _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
 _DaskMatrixLike = Union[dask_Array, dask_DataFrame]
@@ -216,17 +223,17 @@ def _train(
     ----------
     client : dask.distributed.Client
         Dask client.
-    data : dask Array or dask DataFrame of shape = [n_samples, n_features]
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
         Input feature matrix.
-    label : dask Array, dask DataFrame or dask Series of shape = [n_samples]
+    label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]
         The target values (class labels in classification, real numbers in regression).
     params : dict
         Parameters passed to constructor of the local underlying model.
     model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
         Class of the local underlying model.
-    sample_weight : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+    sample_weight : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
         Weights of training data.
-    group : dask Array, dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
+    group : Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)
         Group/query data.
         Only used in the learning-to-rank task.
         sum(group) = n_samples.
@@ -396,7 +403,7 @@ def _predict(
     ----------
     model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
         Fitted underlying model.
-    data : dask Array or dask DataFrame of shape = [n_samples, n_features]
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
         Input feature matrix.
     raw_score : bool, optional (default=False)
         Whether to predict raw scores.
@@ -413,11 +420,11 @@ def _predict(
 
     Returns
     -------
-    predicted_result : dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
+    predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
         The predicted values.
-    X_leaves : dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
+    X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
         If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-    X_SHAP_values : dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
+    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
         If ``pred_contrib=True``, the feature contributions for each sample.
     """
     if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
@@ -448,7 +455,7 @@ def _predict(
             **kwargs
         )
     else:
-        raise TypeError('Data must be either dask Array or dask DataFrame. Got %s.' % str(type(data)))
+        raise TypeError('Data must be either Dask Array or Dask DataFrame. Got %s.' % str(type(data)))
 
 
 class _DaskLGBMModel:
@@ -578,13 +585,17 @@ def __init__(
 
     _base_doc = LGBMClassifier.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs
         + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
@@ -604,7 +615,23 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMClassifier.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMClassifier does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMClassifier support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMClassifier.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
@@ -615,7 +642,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
             **kwargs
         )
 
-    predict.__doc__ = LGBMClassifier.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
+    )
 
     def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba."""
@@ -626,7 +660,14 @@ def predict_proba(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
             **kwargs
         )
 
-    predict_proba.__doc__ = LGBMClassifier.predict_proba.__doc__
+    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted probability for each class for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_probability",
+        predicted_result_shape="Dask Array of shape = [n_samples, n_classes]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]"
+    )
 
     def to_local(self) -> LGBMClassifier:
         """Create regular version of lightgbm.LGBMClassifier from the distributed version.
@@ -695,13 +736,17 @@ def __init__(
 
     _base_doc = LGBMRegressor.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs
         + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
@@ -721,7 +766,23 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMRegressor.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMRegressor does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMRegressor support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMRegressor.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
@@ -731,7 +792,14 @@ def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
             **kwargs
         )
 
-    predict.__doc__ = LGBMRegressor.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
+    )
 
     def to_local(self) -> LGBMRegressor:
         """Create regular version of lightgbm.LGBMRegressor from the distributed version.
@@ -800,13 +868,17 @@ def __init__(
 
     _base_doc = LGBMRanker.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
-    __init__.__doc__ = (
+    _base_doc = (
         _before_kwargs
         + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.\n'
         + ' ' * 8 + _kwargs
         + _after_kwargs
     )
 
+    # the note on custom objective functions in LGBMModel.__init__ is not
+    # currently relevant for the Dask estimators
+    __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')]
+
     def __getstate__(self) -> Dict[Any, Any]:
         return self._lgb_getstate()
@@ -832,13 +904,39 @@ def fit(
             **kwargs
         )
 
-    fit.__doc__ = LGBMRanker.fit.__doc__
+    _base_doc = _lgbmmodel_doc_fit.format(
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
+        sample_weight_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)",
+        group_shape="Dask Array, Dask DataFrame, Dask Series of shape = [n_samples] or None, optional (default=None)"
+    )
+
+    # DaskLGBMRanker does not support init_score, evaluation data, or early stopping
+    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
+                 + _base_doc[_base_doc.find('group :'):])
+
+    _base_doc = (_base_doc[:_base_doc.find('eval_set :')]
+                 + _base_doc[_base_doc.find('verbose :'):])
+
+    # DaskLGBMRanker support for callbacks and init_model is not tested
+    fit.__doc__ = (
+        _base_doc[:_base_doc.find('callbacks :')]
+        + '**kwargs\n'
+        + ' ' * 12 + 'Other parameters passed through to ``LGBMRanker.fit()``\n'
+    )
 
     def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
         """Docstring is inherited from the lightgbm.LGBMRanker.predict."""
         return _predict(self.to_local(), X, **kwargs)
 
-    predict.__doc__ = LGBMRanker.predict.__doc__
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="Dask Array of shape = [n_samples]",
+        X_leaves_shape="Dask Array of shape = [n_samples, n_trees]",
+        X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]"
+    )
 
     def to_local(self) -> LGBMRanker:
         """Create regular version of lightgbm.LGBMRanker from the distributed version.
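Aside (not part of the patch): every Dask ``fit.__doc__`` block above follows the same two-step pattern — fill the shared ``_lgbmmodel_doc_fit`` template with ``str.format()``, then cut away the parameter entries the Dask API does not support by slicing on ``str.find()``. Note that the ranker variant resumes its first slice at ``'group :'`` so that the ``group`` entry survives while ``init_score`` is dropped. A minimal, runnable sketch of the trick; the template and section names below are made-up stand-ins, not LightGBM's real strings:

    # hypothetical template standing in for _lgbmmodel_doc_fit
    _doc_template = """Build a model.

        Parameters
        ----------
        X : {X_shape}
            Input feature matrix.
        init_score : array-like or None
            Init score of training data.
        verbose : bool
            Verbosity.
        callbacks : list or None
            Callback functions.
        """

    # step 1: fill the placeholders
    _base_doc = _doc_template.format(X_shape="Dask Array of shape = [n_samples, n_features]")

    # step 2: drop the 'init_score' entry by keeping the text before it
    # and resuming at the next entry that should survive
    _base_doc = (_base_doc[:_base_doc.find('init_score :')]
                 + _base_doc[_base_doc.find('verbose :'):])

    # step 3: truncate at 'callbacks' and document **kwargs instead
    fit_doc = (_base_doc[:_base_doc.find('callbacks :')]
               + '**kwargs\n'
               + ' ' * 12 + 'Other parameters passed through to ``fit()``\n')

Because each slice ends right before a parameter name, the indentation that preceded the removed entry is left in place and lines up with the entry the slice resumes at.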
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 6a48f2c7b7d5..ce5abe24f8d4 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -176,6 +176,170 @@ def __call__(self, preds, dataset):
         raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
 
 
+# documentation templates for LGBMModel methods are shared between the classes in
+# this module and those in the ``dask`` module
+
+_lgbmmodel_doc_fit = (
+    """
+    Build a gradient boosting model from the training set (X, y).
+
+    Parameters
+    ----------
+    X : {X_shape}
+        Input feature matrix.
+    y : {y_shape}
+        The target values (class labels in classification, real numbers in regression).
+    sample_weight : {sample_weight_shape}
+        Weights of training data.
+    init_score : array-like of shape = [n_samples] or None, optional (default=None)
+        Init score of training data.
+    group : {group_shape}
+        Group/query data.
+        Only used in the learning-to-rank task.
+        sum(group) = n_samples.
+        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+    eval_set : list or None, optional (default=None)
+        A list of (X, y) tuple pairs to use as validation sets.
+    eval_names : list of strings or None, optional (default=None)
+        Names of eval_set.
+    eval_sample_weight : list of arrays or None, optional (default=None)
+        Weights of eval data.
+    eval_class_weight : list or None, optional (default=None)
+        Class weights of eval data.
+    eval_init_score : list of arrays or None, optional (default=None)
+        Init score of eval data.
+    eval_group : list of arrays or None, optional (default=None)
+        Group data of eval data.
+    eval_metric : string, callable, list or None, optional (default=None)
+        If string, it should be a built-in evaluation metric to use.
+        If callable, it should be a custom evaluation metric, see note below for more details.
+        If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both.
+        In either case, the ``metric`` from the model parameters will be evaluated and used as well.
+        Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
+    early_stopping_rounds : int or None, optional (default=None)
+        Activates early stopping. The model will train until the validation score stops improving.
+        Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
+        to continue training.
+        Requires at least one validation data and one metric.
+        If there's more than one, will check all of them. But the training data is ignored anyway.
+        To check only the first metric, set the ``first_metric_only`` parameter to ``True``
+        in additional parameters ``**kwargs`` of the model constructor.
+    verbose : bool or int, optional (default=True)
+        Requires at least one evaluation data.
+        If True, the eval metric on the eval set is printed at each boosting stage.
+        If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage.
+        The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.
+
+        .. rubric:: Example
+
+        With ``verbose`` = 4 and at least one item in ``eval_set``,
+        an evaluation metric is printed every 4 (instead of 1) boosting stages.
+
+    feature_name : list of strings or 'auto', optional (default='auto')
+        Feature names.
+        If 'auto' and data is pandas DataFrame, data columns names are used.
+    categorical_feature : list of strings or int, or 'auto', optional (default='auto')
+        Categorical features.
+        If list of int, interpreted as indices.
+        If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
+        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
+        All values in categorical features should be less than int32 max value (2147483647).
+        Large values could be memory consuming. Consider using consecutive integers starting from zero.
+        All negative values in categorical features will be treated as missing values.
+        The output cannot be monotonically constrained with respect to a categorical feature.
+    callbacks : list of callback functions or None, optional (default=None)
+        List of callback functions that are applied at each iteration.
+        See Callbacks in Python API for more information.
+    init_model : string, Booster, LGBMModel or None, optional (default=None)
+        Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
+
+    Returns
+    -------
+    self : object
+        Returns self.
+    """
+)
+
+_lgbmmodel_doc_custom_eval_note = """
+    Note
+    ----
+    Custom eval function expects a callable with following signatures:
+    ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
+    ``func(y_true, y_pred, weight, group)``
+    and returns (eval_name, eval_result, is_higher_better) or
+    list of (eval_name, eval_result, is_higher_better):
+
+        y_true : array-like of shape = [n_samples]
+            The target values.
+        y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
+            The predicted values.
+        weight : array-like of shape = [n_samples]
+            The weight of samples.
+        group : array-like
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+        eval_name : string
+            The name of evaluation function (without whitespaces).
+        eval_result : float
+            The eval result.
+        is_higher_better : bool
+            Is eval result higher better, e.g. AUC is ``is_higher_better``.
+
+    For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``).
+    For multi-class task, the y_pred is group by class_id first, then group by row_id.
+    If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
+"""
+
+_lgbmmodel_doc_predict = (
+    """
+    {description}
+
+    Parameters
+    ----------
+    X : {X_shape}
+        Input features matrix.
+    raw_score : bool, optional (default=False)
+        Whether to predict raw scores.
+    start_iteration : int, optional (default=0)
+        Start index of the iteration to predict.
+        If <= 0, starts from the first iteration.
+    num_iteration : int or None, optional (default=None)
+        Total number of iterations used in the prediction.
+        If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
+        otherwise, all iterations from ``start_iteration`` are used (no limits).
+        If <= 0, all iterations from ``start_iteration`` are used (no limits).
+    pred_leaf : bool, optional (default=False)
+        Whether to predict leaf index.
+    pred_contrib : bool, optional (default=False)
+        Whether to predict feature contributions.
+
+        .. note::
+
+            If you want to get more explanations for your model's predictions using SHAP values,
+            like SHAP interaction values,
+            you can install the shap package (https://github.com/slundberg/shap).
+            Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
+            column, where the last column is the expected value.
+
+    **kwargs
+        Other parameters for the prediction.
+
+    Returns
+    -------
+    {output_name} : {predicted_result_shape}
+        The predicted values.
+    X_leaves : {X_leaves_shape}
+        If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
+    X_SHAP_values : {X_SHAP_values_shape}
+        If ``pred_contrib=True``, the feature contributions for each sample.
+    """
+)
+
+
 class LGBMModel(_LGBMModelBase):
     """Implementation of the scikit-learn API for LightGBM."""
 
@@ -382,115 +546,7 @@ def fit(self, X, y,
             eval_metric=None, early_stopping_rounds=None, verbose=True,
             feature_name='auto', categorical_feature='auto',
             callbacks=None, init_model=None):
-        """Build a gradient boosting model from the training set (X, y).
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix of shape = [n_samples, n_features]
-            Input feature matrix.
-        y : array-like of shape = [n_samples]
-            The target values (class labels in classification, real numbers in regression).
-        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
-            Weights of training data.
-        init_score : array-like of shape = [n_samples] or None, optional (default=None)
-            Init score of training data.
-        group : array-like or None, optional (default=None)
-            Group/query data.
-            Only used in the learning-to-rank task.
-            sum(group) = n_samples.
-            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
-            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
-        eval_set : list or None, optional (default=None)
-            A list of (X, y) tuple pairs to use as validation sets.
-        eval_names : list of strings or None, optional (default=None)
-            Names of eval_set.
-        eval_sample_weight : list of arrays or None, optional (default=None)
-            Weights of eval data.
-        eval_class_weight : list or None, optional (default=None)
-            Class weights of eval data.
-        eval_init_score : list of arrays or None, optional (default=None)
-            Init score of eval data.
-        eval_group : list of arrays or None, optional (default=None)
-            Group data of eval data.
-        eval_metric : string, callable, list or None, optional (default=None)
-            If string, it should be a built-in evaluation metric to use.
-            If callable, it should be a custom evaluation metric, see note below for more details.
-            If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both.
-            In either case, the ``metric`` from the model parameters will be evaluated and used as well.
-            Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
-        early_stopping_rounds : int or None, optional (default=None)
-            Activates early stopping. The model will train until the validation score stops improving.
-            Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
-            to continue training.
-            Requires at least one validation data and one metric.
-            If there's more than one, will check all of them. But the training data is ignored anyway.
-            To check only the first metric, set the ``first_metric_only`` parameter to ``True``
-            in additional parameters ``**kwargs`` of the model constructor.
-        verbose : bool or int, optional (default=True)
-            Requires at least one evaluation data.
-            If True, the eval metric on the eval set is printed at each boosting stage.
-            If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage.
-            The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.
-
-            .. rubric:: Example
-
-            With ``verbose`` = 4 and at least one item in ``eval_set``,
-            an evaluation metric is printed every 4 (instead of 1) boosting stages.
-
-        feature_name : list of strings or 'auto', optional (default='auto')
-            Feature names.
-            If 'auto' and data is pandas DataFrame, data columns names are used.
-        categorical_feature : list of strings or int, or 'auto', optional (default='auto')
-            Categorical features.
-            If list of int, interpreted as indices.
-            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
-            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
-            All values in categorical features should be less than int32 max value (2147483647).
-            Large values could be memory consuming. Consider using consecutive integers starting from zero.
-            All negative values in categorical features will be treated as missing values.
-            The output cannot be monotonically constrained with respect to a categorical feature.
-        callbacks : list of callback functions or None, optional (default=None)
-            List of callback functions that are applied at each iteration.
-            See Callbacks in Python API for more information.
-        init_model : string, Booster, LGBMModel or None, optional (default=None)
-            Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
-
-        Returns
-        -------
-        self : object
-            Returns self.
-
-        Note
-        ----
-        Custom eval function expects a callable with following signatures:
-        ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
-        ``func(y_true, y_pred, weight, group)``
-        and returns (eval_name, eval_result, is_higher_better) or
-        list of (eval_name, eval_result, is_higher_better):
-
-            y_true : array-like of shape = [n_samples]
-                The target values.
-            y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
-                The predicted values.
-            weight : array-like of shape = [n_samples]
-                The weight of samples.
-            group : array-like
-                Group/query data.
-                Only used in the learning-to-rank task.
-                sum(group) = n_samples.
-                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
-                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
-            eval_name : string
-                The name of evaluation function (without whitespaces).
-            eval_result : float
-                The eval result.
-            is_higher_better : bool
-                Is eval result higher better, e.g. AUC is ``is_higher_better``.
-
-        For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``).
-        For multi-class task, the y_pred is group by class_id first, then group by row_id.
-        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
-        """
+        """Docstring is set after definition, using a template."""
         if self._objective is None:
             if isinstance(self, LGBMRegressor):
                 self._objective = "regression"
@@ -648,49 +704,16 @@ def _get_meta_data(collection, name, i):
         del train_set, valid_sets
         return self
 
+    fit.__doc__ = _lgbmmodel_doc_fit.format(
+        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        y_shape="array-like of shape = [n_samples]",
+        sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
+        group_shape="array-like or None, optional (default=None)"
+    ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
+
     def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
                 pred_leaf=False, pred_contrib=False, **kwargs):
-        """Return the predicted value for each sample.
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix of shape = [n_samples, n_features]
-            Input features matrix.
-        raw_score : bool, optional (default=False)
-            Whether to predict raw scores.
-        start_iteration : int, optional (default=0)
-            Start index of the iteration to predict.
-            If <= 0, starts from the first iteration.
-        num_iteration : int or None, optional (default=None)
-            Total number of iterations used in the prediction.
-            If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
-            otherwise, all iterations from ``start_iteration`` are used (no limits).
-            If <= 0, all iterations from ``start_iteration`` are used (no limits).
-        pred_leaf : bool, optional (default=False)
-            Whether to predict leaf index.
-        pred_contrib : bool, optional (default=False)
-            Whether to predict feature contributions.
-
-            .. note::
-
-                If you want to get more explanations for your model's predictions using SHAP values,
-                like SHAP interaction values,
-                you can install the shap package (https://github.com/slundberg/shap).
-                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
-                column, where the last column is the expected value.
-
-        **kwargs
-            Other parameters for the prediction.
-
-        Returns
-        -------
-        predicted_result : array-like of shape = [n_samples] or shape = [n_samples, n_classes]
-            The predicted values.
-        X_leaves : array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
-            If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-        X_SHAP_values : array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
-            If ``pred_contrib=True``, the feature contributions for each sample.
-        """
+        """Docstring is set after definition, using a template."""
         if self._n_features is None:
             raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.")
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
@@ -704,6 +727,15 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
         return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration,
                                      num_iteration=num_iteration, pred_leaf=pred_leaf, pred_contrib=pred_contrib,
                                      **kwargs)
 
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+    )
+
     @property
     def n_features_(self):
         """:obj:`int`: The number of features of fitted model."""
@@ -885,47 +917,7 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
 
     def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None,
                       pred_leaf=False, pred_contrib=False, **kwargs):
-        """Return the predicted probability for each class for each sample.
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix of shape = [n_samples, n_features]
-            Input features matrix.
-        raw_score : bool, optional (default=False)
-            Whether to predict raw scores.
-        start_iteration : int, optional (default=0)
-            Start index of the iteration to predict.
-            If <= 0, starts from the first iteration.
-        num_iteration : int or None, optional (default=None)
-            Total number of iterations used in the prediction.
-            If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
-            otherwise, all iterations from ``start_iteration`` are used (no limits).
-            If <= 0, all iterations from ``start_iteration`` are used (no limits).
-        pred_leaf : bool, optional (default=False)
-            Whether to predict leaf index.
-        pred_contrib : bool, optional (default=False)
-            Whether to predict feature contributions.
-
-            .. note::
-
-                If you want to get more explanations for your model's predictions using SHAP values,
-                like SHAP interaction values,
-                you can install the shap package (https://github.com/slundberg/shap).
-                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
-                column, where the last column is the expected value.
-
-        **kwargs
-            Other parameters for the prediction.
-
-        Returns
-        -------
-        predicted_probability : array-like of shape = [n_samples, n_classes]
-            The predicted probability for each class for each sample.
-        X_leaves : array-like of shape = [n_samples, n_trees * n_classes]
-            If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
-        X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
-            If ``pred_contrib=True``, the feature contributions for each sample.
-        """
+        """Docstring is set after definition, using a template."""
         result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs)
         if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
             _log_warning("Cannot compute class probabilities or labels "
@@ -937,6 +929,15 @@ def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None,
         else:
             return np.vstack((1. - result, result)).transpose()
 
+    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted probability for each class for each sample.",
+        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        output_name="predicted_probability",
+        predicted_result_shape="array-like of shape = [n_samples, n_classes]",
+        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+    )
+
     @property
     def classes_(self):
         """:obj:`array` of shape = [n_classes]: The class label array."""
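Aside (not part of the patch): the sklearn.py hunks above work because a method is an ordinary function object while its class body executes, so ``__doc__`` can be assigned immediately after the ``def`` with a plain statement at class scope. A small illustrative sketch; the template and class here are stand-ins, not LightGBM code:

    # hypothetical template standing in for _lgbmmodel_doc_predict
    _doc_predict = """{description}

        X : {X_shape}
            Input features matrix.
        """

    class Model:
        def predict(self, X):
            """Docstring is set after definition, using a template."""
            return X

        # runs once, at class-creation time; help(Model.predict) then
        # shows the formatted text instead of the placeholder above
        predict.__doc__ = _doc_predict.format(
            description="Return the predicted value for each sample.",
            X_shape="array-like of shape = [n_samples, n_features]"
        )

This is why the diff replaces each long literal docstring with the short placeholder "Docstring is set after definition, using a template." and moves the real text into the class-level ``format()`` call.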
diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py
index c441466b3151..6ed56e87f800 100644
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -575,7 +575,7 @@ def test_ranker(output, client, listen_port, group):
         group=group,
     )
 
-    # rebalance small dask.array dataset for better performance.
+    # rebalance small dask.Array dataset for better performance.
     if output == 'array':
         dX = dX.persist()
         dy = dy.persist()
@@ -584,7 +584,7 @@ def test_ranker(output, client, listen_port, group):
     _ = wait([dX, dy, dw, dg])
     client.rebalance()
 
-    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
+    # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
     # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
     params = {
         "random_state": 42,
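Aside (not part of the patch): one last piece of string surgery used by the Dask estimators, not illustrated above, is splicing the ``client`` parameter into an inherited ``__init__`` docstring. ``str.partition('**kwargs')`` splits the docstring at the ``**kwargs`` entry, and the new entry is inserted in front of it with the numpydoc indentation maintained by hand (8 spaces for parameter names, 12 for their descriptions). A sketch against a made-up base docstring:

    # hypothetical base docstring standing in for LGBMClassifier.__init__.__doc__
    _base_doc = """Construct a model.

        Parameters
        ----------
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        **kwargs
            Other parameters for the model.
        """

    # _before_kwargs ends with the 8 spaces that preceded '**kwargs',
    # so the new 'client' entry inherits that indentation
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
    _new_doc = (
        _before_kwargs
        + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client.\n'
        + ' ' * 8 + _kwargs
        + _after_kwargs
    )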