sktime · fkiraly · Aug 26, 2023 · Aug 26, 2023 · Aug 26, 2023 · Aug 26, 2023
@@ -114,7 +114,15 @@ def _predict(self, X):
         y : pandas DataFrame, same length as `X`
             labels predicted for `X`
         """
-        raise NotImplementedError
+        implements_proba = self._has_implementation_of("_predict_proba")
+
+        if not implements_proba:
+            raise NotImplementedError
+
+        if implements_proba:
+            pred_proba = self._predict_proba(X=X)
+            pred_mean = pred_proba.mean()
+            return pred_mean
 
     def predict_proba(self, X):
         """Predict distribution over labels for data from features.

@@ -0,0 +1,196 @@
+# -*- coding: utf-8 -*-
+"""Probabilistic regression by bootstrap."""
+
+__author__ = ["fkiraly"]
+__all__ = ["BootstrapRegressor"]
+
+import numpy as np
+import pandas as pd
+from sklearn import clone
+
+from skpro.distributions.empirical import Empirical
+from skpro.regression.base import BaseProbaRegressor
+
+
+class BootstrapRegressor(BaseProbaRegressor):
+    """Bootstrap ensemble of a tabular regressor.
+
+    Fits ``n_estimators`` clones of an skpro regressor on
+    datasets which are bootstrap sub-samples, i.e.,
+    independent row samples with replacement.
+
+    On ``predict_proba``, an empirical distribution with the bootstrap
+    sample is returned.
+
+    The estimator allows to choose sample sizes for instances, variables,
+    and whether sampling is with or without replacement.
+
+    Direct generalization of ``sklearn``'s ``BaggingClassifier``
+    to the probabilistic regrsesion task.
+
+    Parameters
+    ----------
+    estimator : sklearn regressor
+        regressor to use in the bootstrap
+    n_bootstrap_samples : int, default=100
+        The number of bootstrap samples drawn
+        If int, then indicates number of instances precisely
+        Note: this is not the same as the size of each bootstrap sample.
+        The size of the bootstrap sample is always equal to X.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, ``random_state`` is the seed used by the random number generator;
+        If ``RandomState`` instance, ``random_state`` is the random number generator;
+        If None, the random number generator is the ``RandomState`` instance used
+        by ``np.random``.
+
+    Attributes
+    ----------
+    estimators_ : list of of skpro regressors
+        clones of regressor in `estimator` fitted in the ensemble
+
+    Examples
+    --------
+    >>> from skpro.regression.bootstrap import BootstrapRegressor
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from sklearn.datasets import load_diabetes
+    >>> from sklearn.model_selection import train_test_split
+    >>>
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
+    >>>
+    >>> reg_tabular = LinearRegression()
+    >>>
+    >>> reg_proba = BootstrapRegressor(reg_tabular)
+    >>> reg_proba.fit(X_train, y_train)
+    BootstrapRegressor(...)
+    >>> y_pred = reg_proba.predict_proba(X_test)
+    """
+
+    _tags = {"capability:missing": True}
+
+    def __init__(
+        self,
+        estimator,
+        n_bootstrap_samples=100,
+        random_state=None,
+    ):
+        self.estimator = estimator
+        self.n_bootstrap_samples = n_bootstrap_samples
+        self.random_state = random_state
+
+        super().__init__()
+
+        # todo: find the equivalent tag in sklearn for missing data handling
+        # tags_to_clone = ["capability:missing"]
+        # self.clone_tags(estimator, tags_to_clone)
+
+    def _fit(self, X, y):
+        """Fit regressor to training data.
+
+        Writes to self:
+            Sets fitted model attributes ending in "_".
+
+        Parameters
+        ----------
+        X : pandas DataFrame
+            feature instances to fit regressor to
+        y : pandas DataFrame, must be same length as X
+            labels to fit regressor to
+
+        Returns
+        -------
+        self : reference to self
+        """
+        estimator = self.estimator
+        n_bootstrap_samples = self.n_bootstrap_samples
+        np.random.seed(self.random_state)
+
+        inst_ix = X.index
+        n = len(inst_ix)
+
+        self.estimators_ = []
+        self._cols = y.columns
+
+        for _i in range(n_bootstrap_samples):
+            esti = clone(estimator)
+            row_iloc = pd.RangeIndex(n)
+            row_ss = _random_ss_ix(row_iloc, size=n, replace=True)
+            inst_ix_i = inst_ix[row_ss]
+
+            Xi = X.loc[inst_ix_i]
+            Xi = Xi.reset_index(drop=True)
+
+            yi = y.loc[inst_ix_i].reset_index(drop=True)
+
+            self.estimators_ += [esti.fit(Xi, yi)]
+
+        return self
+
+    def _predict_proba(self, X) -> np.ndarray:
+        """Predict distribution over labels for data from features.
+
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
+
+        Parameters
+        ----------
+        X : pandas DataFrame, must have same columns as X in `fit`
+            data to predict labels for
+
+        Returns
+        -------
+        y : skpro BaseDistribution, same length as `X`
+            labels predicted for `X`
+        """
+        cols = self._cols
+        y_preds = [est.predict(X) for est in self.estimators_]
+
+        def _coerce_df(x):
+            if not isinstance(x, pd.DataFrame):
+                x = pd.DataFrame(x, columns=cols, index=X.index)
+            return x
+
+        y_preds = [_coerce_df(x) for x in y_preds]
+
+        y_pred_df = pd.concat(y_preds, axis=0, keys=range(len(y_preds)))
+
+        y_proba = Empirical(y_pred_df)
+        return y_proba
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class
+            Each dict are parameters to construct an "interesting" test instance, i.e.,
+            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
+            `create_test_instance` uses the first (or only) dictionary in `params`
+        """
+        from sklearn.linear_model import LinearRegression
+
+        params1 = {"estimator": LinearRegression()}
+        params2 = {
+            "estimator": LinearRegression(),
+            "n_bootstrap_samples": 10,
+        }
+
+        return [params1, params2]
+
+
+def _random_ss_ix(ix, size, replace=True):
+    """Randomly uniformly sample indices from a list of indices."""
+    a = range(len(ix))
+    ixs = ix[np.random.choice(a, size=size, replace=replace)]
+    return ixs