diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 4f6637c6..7004c985 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -1,6 +1,9 @@ -"""Interface adapter for the Generalized Linear Model Regressor with Gaussian Link.""" +"""Interface adapter for the Generalized Linear Model Regressor.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +__author__ = ["ShreeshaM07", "julian-fong"] + +import numpy as np import pandas as pd from skpro.regression.base import BaseProbaRegressor @@ -18,11 +21,45 @@ class GLMRegressor(BaseProbaRegressor): Parameters ---------- + family : string, default : "Normal" + The family parameter denotes the type of distribution + that will be used. + Available family/distributions are + 1."Normal" + 2."Poisson" + 3."Gamma" + link : string, default : None + This parameter is used to represent the link function to be + used with the distribution. + If default is None it will internally replace with default of the + respective family. The default is the first string + against each family below. + Available safe options for the respective family are: + ``Normal`` : "Identity", "Log", "InversePower"; + ``Poisson`` : "Log", "Identity", "Sqrt"; + ``Gamma`` : "InversePower", "Log", "Identity"; + offset_var : string or int, default = None + Pass the column name as a string or column number as an int in X. + If string, then the exog or ``X`` passed while ``fit``-ting + must have an additional column with column name passed through + ``offset_var`` with any values against each row. When ``predict``ing + have an additional column with name same as string passed through ``offset_var`` + in X with all the ``offset_var`` values for predicting + stored in the column for each row. + If ``int`` it corresponding column number will be considered. + exposure_var : string or int, default = None + Pass the column name as a string or column number as an int in X. + If string, then the exog or ``X`` passed while ``fit``-ting + must have an additional column with column name passed through + ``exposure_var`` with any values against each row. When ``predict``ing + have additional column with name same as string passed through ``exposure_var`` + in X with all the ``exposure_var`` values for predicting + stored in the column for each row. + If ``int`` it corresponding column number will be considered. missing : str Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -157,8 +194,8 @@ class GLMRegressor(BaseProbaRegressor): """ _tags = { - "authors": ["julian-fong"], - "maintainers": ["julian-fong"], + "authors": ["ShreeshaM07", "julian-fong"], + "maintainers": ["ShreeshaM07", "julian-fong"], "python_version": None, "python_dependencies": "statsmodels", "capability:multioutput": False, @@ -167,6 +204,101 @@ class GLMRegressor(BaseProbaRegressor): "y_inner_mtype": "pd_DataFrame_Table", } + def _str_to_sm_family(self, family, link): + """Convert the string to a statsmodel object. + + If the link function is also explcitly mentioned then include then + that must be passed to the family/distribution object. + """ + from warnings import warn + + from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson + from statsmodels.genmod.families.links import Identity, InversePower, Log, Sqrt + + sm_fmly = { + "Normal": Gaussian, + "Poisson": Poisson, + "Gamma": Gamma, + } + + links = { + "Log": Log, + "Identity": Identity, + "InversePower": InversePower, + "Sqrt": Sqrt, + } + + if link in links: + link_function = links[link]() + try: + return sm_fmly[family](link_function) + except Exception: + msg = "Invalid link for family, default link will be used" + warn(msg) + + return sm_fmly[family]() + + # TODO (release 2.4.0) + # replace the existing definition of `__init__` with + # the below definition for `__init__`. + # def __init__( + # self, + # family="Normal", + # link=None, + # offset_var=None, + # exposure_var=None, + # missing="none", + # start_params=None, + # maxiter=100, + # method="IRLS", + # tol=1e-8, + # scale=None, + # cov_type="nonrobust", + # cov_kwds=None, + # use_t=None, + # full_output=True, + # disp=False, + # max_start_irls=3, + # add_constant=False, + # ): + # super().__init__() + + # self.family = family + # self.link = link + # self.offset_var = offset_var + # self.exposure_var = exposure_var + # self.missing = missing + # self.start_params = start_params + # self.maxiter = maxiter + # self.method = method + # self.tol = tol + # self.scale = scale + # self.cov_type = cov_type + # self.cov_kwds = cov_kwds + # self.use_t = use_t + # self.full_output = full_output + # self.disp = disp + # self.max_start_irls = max_start_irls + # self.add_constant = add_constant + + # self._family = self.family + # self._link = self.link + # self._offset_var = self.offset_var + # self._exposure_var = self.exposure_var + # self._missing = self.missing + # self._start_params = self.start_params + # self._maxiter = self.maxiter + # self._method = self.method + # self._tol = self.tol + # self._scale = self.scale + # self._cov_type = self.cov_type + # self._cov_kwds = self.cov_kwds + # self._use_t = self.use_t + # self._full_output = self.full_output + # self._disp = self.disp + # self._max_start_irls = self.max_start_irls + # self._add_constant = self.add_constant + def __init__( self, missing="none", @@ -182,11 +314,21 @@ def __init__( disp=False, max_start_irls=3, add_constant=False, + family="Normal", + link=None, + offset_var=None, + exposure_var=None, ): + # The default values of the parameters + # are replaced with the changed sequence + # of parameters ranking for each of them + # from 0 to 16(total 17 parameters). super().__init__() - from statsmodels.genmod.families.family import Gaussian - self._family = Gaussian() + self.family = family + self.link = link + self.offset_var = offset_var + self.exposure_var = exposure_var self.missing = missing self.start_params = start_params self.maxiter = maxiter @@ -201,6 +343,37 @@ def __init__( self.max_start_irls = max_start_irls self.add_constant = add_constant + self._family = self.family + self._link = self.link + self._offset_var = self.offset_var + self._exposure_var = self.exposure_var + self._missing = self.missing + self._start_params = self.start_params + self._maxiter = self.maxiter + self._method = self.method + self._tol = self.tol + self._scale = self.scale + self._cov_type = self.cov_type + self._cov_kwds = self.cov_kwds + self._use_t = self.use_t + self._full_output = self.full_output + self._disp = self.disp + self._max_start_irls = self.max_start_irls + self._add_constant = self.add_constant + + from warnings import warn + + warn( + "Note: in `GLMRegressor`, the sequence of the parameters will change " + "in skpro version 2.5.0. It will be as per the order present in the" + "current docstring with the top one being the first parameter.\n" + "The defaults for the parameters will remain same and " + "there will be no changes.\n" + "Please use the `kwargs` calls instead of positional calls for the" + "parameters until the release of skpro 2.5.0 " + "as this will avoid any discrepancies." + ) + def _fit(self, X, y): """Fit regressor to training data. @@ -227,32 +400,43 @@ def _fit(self, X, y): """ from statsmodels.genmod.generalized_linear_model import GLM - X_ = self._prep_x(X) + # remove the offset and exposure columns which + # was inserted to maintain the shape + offset_var = self._offset_var + exposure_var = self._exposure_var + + X_ = self._prep_x(X, offset_var, exposure_var, False) y_col = y.columns + family = self._family + link = self._link + sm_family = self._str_to_sm_family(family=family, link=link) + glm_estimator = GLM( endog=y, exog=X_, - family=self._family, - missing=self.missing, + family=sm_family, + missing=self._missing, ) self._estimator = glm_estimator - fitted_glm_model = glm_estimator.fit( - self.start_params, - self.maxiter, - self.method, - self.tol, - self.scale, - self.cov_type, - self.cov_kwds, - self.use_t, - self.full_output, - self.disp, - self.max_start_irls, - ) + glm_fit_params = { + "start_params": self._start_params, + "maxiter": self._maxiter, + "method": self._method, + "tol": self._tol, + "scale": self._scale, + "cov_type": self._cov_type, + "cov_kwds": self._cov_kwds, + "use_t": self._use_t, + "full_output": self._full_output, + "disp": self._disp, + "max_start_irls": self._max_start_irls, + } + + fitted_glm_model = glm_estimator.fit(**glm_fit_params) PARAMS_TO_FORWARD = { "df_model_": glm_estimator.df_model, @@ -304,15 +488,63 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ - X_ = self._prep_x(X) + offset_var = self._offset_var + exposure_var = self._exposure_var + offset_arr = None + exposure_arr = None + + X_, offset_arr, exposure_arr = self._prep_x(X, offset_var, exposure_var, True) index = X_.index y_column = self.y_col - y_pred_series = self.glm_fit_.predict(X_) + y_pred_series = self.glm_fit_.predict( + X_, offset=offset_arr, exposure=exposure_arr + ) y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred + def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): + """Convert the statsmodels output to equivalent skpro distribution.""" + from skpro.distributions.gamma import Gamma + from skpro.distributions.normal import Normal + from skpro.distributions.poisson import Poisson + + skpro_distr = { + "Normal": Normal, + "Poisson": Poisson, + "Gamma": Gamma, + } + + params = {} + skp_dist = Normal + + if family in skpro_distr: + skp_dist = skpro_distr[family] + + if skp_dist == Normal: + y_mu = y_predictions_df["mean"].rename("mu").to_frame() + y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() + params["mu"] = y_mu + params["sigma"] = y_sigma + elif skp_dist == Poisson: + y_mu = y_predictions_df["mean"].rename("mu").to_frame() + params["mu"] = y_mu + elif skp_dist == Gamma: + y_mean = y_predictions_df["mean"] + y_sd = y_predictions_df["mean_se"] + y_alpha = (y_mean / y_sd) ** 2 + y_beta = (y_mean / (y_sd**2)).rename("beta").to_frame() + y_alpha = y_alpha.rename("alpha").to_frame() + params["alpha"] = y_alpha + params["beta"] = y_beta + + params["index"] = index + params["columns"] = columns + + y_pred = skp_dist(**params) + return y_pred + def _predict_proba(self, X): """Predict distribution over labels for data from features. @@ -332,30 +564,34 @@ def _predict_proba(self, X): y_pred : skpro BaseDistribution, same length as `X` labels predicted for `X` """ - from skpro.distributions.normal import Normal + # remove the offset and exposure columns + # which was inserted to maintain the shape + offset_var = self._offset_var + exposure_var = self._exposure_var - X_ = self._prep_x(X) + X_ = self._prep_x(X, offset_var, exposure_var, False) # instead of using the conventional predict() method, we use statsmodels # get_prediction method, which returns a pandas df that contains # the prediction and prediction variance i.e mu and sigma y_column = self.y_col y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame() - y_mu = y_predictions_df["mean"].rename("mu").to_frame() - y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() - params = { - "mu": y_mu, - "sigma": y_sigma, - "index": X_.index, - "columns": y_column, - } - y_pred = Normal(**params) + + # convert the returned values to skpro equivalent distribution + family = self._family + index = X_.index + columns = y_column + + y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, family) return y_pred - def _prep_x(self, X): + def _prep_x(self, X, offset_var, exposure_var, rtn_off_exp_arr): """ Return a copy of X with an added constant of self.add_constant = True. + If rtn_off_exp_arr is True it will also return offset and exposure + arrays along with updated X. + Parameters ---------- X : pandas DataFrame @@ -366,13 +602,46 @@ def _prep_x(self, X): X_ : pandas DataFrame A copy of the input X with an added column 'const' with is an array of len(X) of 1s + offset_arr : numpy.array + The copy of column which is meant for offsetting present in X. + exposure_arr : numpy.array + The copy of column which is meant for exposure present in X. """ from statsmodels.tools import add_constant - if self.add_constant: + offset_arr = None + exposure_arr = None + if offset_var is not None: + if isinstance(offset_var, str): + offset_var = pd.Index([offset_var]) + offset_arr = np.array(X[offset_var]).flatten() + elif isinstance(offset_var, int): + offset_arr = np.array(X.iloc[:, offset_var]).flatten() + offset_var = pd.Index([X.iloc[:, offset_var].name]) + if exposure_var is not None: + if isinstance(exposure_var, str): + exposure_var = pd.Index([exposure_var]) + exposure_arr = np.array(X[exposure_var]).flatten() + elif isinstance(exposure_var, int): + exposure_arr = np.array(X.iloc[:, exposure_var]).flatten() + exposure_var = pd.Index([X.iloc[:, exposure_var].name]) + # drop the offset and exposure columns from X + columns_to_drop = [] + if offset_var is not None: + columns_to_drop.append(offset_var[0]) + if exposure_var is not None: + columns_to_drop.append(exposure_var[0]) + if columns_to_drop: + X = X.drop(columns_to_drop, axis=1) + + if self._add_constant: X_ = add_constant(X) + if rtn_off_exp_arr: + return X_, offset_arr, exposure_arr return X_ else: + if rtn_off_exp_arr: + return X, offset_arr, exposure_arr return X @classmethod @@ -395,5 +664,19 @@ def get_test_params(cls, parameter_set="default"): """ params1 = {} params2 = {"add_constant": True} + params3 = { + "family": "Poisson", + "add_constant": True, + } + params4 = {"family": "Gamma"} + params5 = { + "family": "Normal", + "link": "InversePower", + } + params6 = { + "family": "Poisson", + "link": "Log", + "add_constant": True, + } - return [params1, params2] + return [params1, params2, params3, params4, params5, params6] diff --git a/skpro/regression/tests/test_glm.py b/skpro/regression/tests/test_glm.py new file mode 100644 index 00000000..b1435dc4 --- /dev/null +++ b/skpro/regression/tests/test_glm.py @@ -0,0 +1,60 @@ +"""Tests Generalized Linear Model regressor.""" + +import pandas as pd +import pytest + +from skpro.regression.linear import GLMRegressor +from skpro.tests.test_switch import run_test_for_class + + +@pytest.mark.skipif( + not run_test_for_class(GLMRegressor), + reason="run test only if softdeps are present and incrementally (if requested)", +) +def test_glm_simple_use(): + """Test simple use of GLM regressor.""" + from sklearn.datasets import load_diabetes + from sklearn.model_selection import train_test_split + + X, y = load_diabetes(return_X_y=True, as_frame=True) + y = pd.DataFrame(y) + X = X.iloc[:200] + y = y.iloc[:200] + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + glm_reg = GLMRegressor() + glm_reg.fit(X_train, y_train) + y_pred = glm_reg.predict(X_test) + y_pred_proba = glm_reg.predict_proba(X_test) + + assert y_pred.shape == y_test.shape + assert y_pred_proba.shape == y_test.shape + + +@pytest.mark.skipif( + not run_test_for_class(GLMRegressor), + reason="run test only if softdeps are present and incrementally (if requested)", +) +def test_glm_with_offset_exposure(): + """Test GLM with offset_var and exposure_var parameters.""" + import numpy as np + from sklearn.datasets import load_diabetes + from sklearn.model_selection import train_test_split + + X, y = load_diabetes(return_X_y=True, as_frame=True) + y = pd.DataFrame(y) + X = X.iloc[:200] + y = y.iloc[:200] + X["off"] = np.ones(X.shape[0]) * 2.1 + X["exp"] = np.arange(1, X.shape[0] + 1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + glm_reg = GLMRegressor( + family="Normal", link="Log", offset_var="off", exposure_var=-1 + ) + glm_reg.fit(X_train, y_train) + y_pred = glm_reg.predict(X_test) + y_pred_proba = glm_reg.predict_proba(X_test) + + assert y_pred.shape == y_test.shape + assert y_pred_proba.shape == y_test.shape