diff --git a/pygam/__init__.py b/pygam/__init__.py index de02a23e..3cea2f93 100644 --- a/pygam/__init__.py +++ b/pygam/__init__.py @@ -21,4 +21,4 @@ __all__ = ['GAM', 'LinearGAM', 'LogisticGAM', 'GammaGAM', 'PoissonGAM', 'InvGaussGAM', 'ExpectileGAM', 'l', 's', 'f', 'te', 'intercept'] -__version__ = '0.6.1' +__version__ = '0.6.2' diff --git a/pygam/pygam.py b/pygam/pygam.py index c2f18625..8bc372fd 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -1614,7 +1614,7 @@ def summary(self): term_data = { 'feature_func': repr(term), - 'lam': np.round(flatten(term.lam), 4), + 'lam': '' if term.isintercept else np.round(flatten(term.lam), 4), 'rank': '{}'.format(term.n_coefs), 'edof': '{}'.format(edof), 'p_value': '%.2e'%(self.statistics_['p_values'][i]), @@ -1690,12 +1690,16 @@ def gridsearch(self, X, y, weights=None, return_scores=False, progress : bool, default: True whether to display a progress bar - **kwargs : dict, default {'lam': np.logspace(-3, 3, 11)} + **kwargs : dict, default `lam=np.logspace(-3, 3, 11)`} pairs of parameters and iterables of floats, or parameters and iterables of iterables of floats. - if iterable of iterables of floats, the outer iterable must have - length m_features. + if grid is iterable of iterables of floats, + the outer iterable must have length m_features. + the cartesian product of the subgrids in the grid will be tested. + + if grid is a 2d numpy array, + each row of the array will be tested. the method will make a grid of all the combinations of the parameters and fit a GAM to each combination. @@ -1709,6 +1713,29 @@ def gridsearch(self, X, y, weights=None, return_scores=False, objective scores as values else: self, ie possibly the newly fitted model + + Examples + -------- + For a model with 3 terms, and where we expect 3 lam values, + our search space for lam must have 3 dimensions. + + However we can search the space in 2 ways: + - via cartesian product by specifying the grid as a list + our grid search will consider 11 ** 3 points + + >>> lam = np.logspace(-3, 3, 11) + >>> lams = [lam] * 3 + >>> gam.gridsearch(X, y, lam=lams) + + - directly by specifying the grid as a np.ndarray + our gridsearch will consider 11 points + + >>> lam = np.logspace(-3, 3, 11) + >>> lams = np.array([lam] * 3) + >>> gam.gridsearch(X, y, lam=lams) + + the latter is useful for when the dimensionality of the search space + is very large, and we would prefer to execute a randomized search. """ # check if model fitted if not self._is_fitted: @@ -1758,9 +1785,11 @@ def gridsearch(self, X, y, weights=None, return_scores=False, grids = [] for param, grid in list(param_grids.items()): + # check param exists if param not in (admissible_params): raise ValueError('unknown parameter: {}'.format(param)) + # check grid is iterable at all if not (isiterable(grid) and (len(grid) > 1)): \ raise ValueError('{} grid must either be iterable of ' 'iterables, or an iterable of lengnth > 1, '\ @@ -1768,11 +1797,26 @@ def gridsearch(self, X, y, weights=None, return_scores=False, # prepare grid if any(isiterable(g) for g in grid): - # cast to np.array + + # get required parameter shape + target_len = len(flatten(getattr(self, param))) + + # check if cartesian product needed + cartesian = (not isinstance(grid, np.ndarray) or grid.ndim != 2) + + # build grid grid = [np.atleast_1d(g) for g in grid] - # set grid to combination of all grids - grid = combine(*grid) + # check chape + msg = '{} grid should have {} columns, '\ + 'but found grid with {} columns'.format(param, target_len, len(grid)) + if cartesian: + if len(grid) != target_len: + raise ValueError(msg) + grid = combine(*grid) + + if not all([len(subgrid) == target_len for subgrid in grid]): + raise ValueError(msg) # save param name and grid params.append(param) @@ -1852,8 +1896,26 @@ def gridsearch(self, X, y, weights=None, return_scores=False, else: return self + def randomsearch(self, X, y, weights=None, n=20, **param_grids): + + if not self._is_fitted: + self._validate_params() + self._validate_data_dep_params(X) + + # if no params, then set up default search + if not bool(param_grids): + lam = np.random.rand(n, len(flatten(self.lam))) * 6 - 3 + lam = np.exp(lam) + param_grids['lam'] = lam + + # TODO what should happen when we specify more than one parameter? + # how do we search the random space of all parameters + + return self.gridsearch(X, y, weights=weights, **param_grids) + + def sample(self, X, y, quantity='y', sample_at_X=None, - weights=None, n_draws=100, n_bootstraps=1, objective='auto'): + weights=None, n_draws=100, n_bootstraps=5, objective='auto'): """Simulate from the posterior of the coefficients and smoothing params. Samples are drawn from the posterior of the coefficients and smoothing @@ -1882,11 +1944,6 @@ def sample(self, X, y, quantity='y', sample_at_X=None, `n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage of the expensive bootstrap samples of the smoothing parameters. - NOTE: For now, the grid of `lam` values is the default of `gridsearch`. - Until randomized grid search is implemented, it is not worth setting - `n_bootstraps` to a value greater than one because the smoothing - parameters will be identical in each bootstrap sample. - Parameters ----------- X : array of shape (n_samples, m_features) @@ -1915,7 +1972,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None, The number of samples to draw from the posterior distribution of the coefficients and smoothing parameters - n_bootstraps : positive int, default: 1 + n_bootstraps : positive int, default: 5 The number of bootstrap samples to draw from simulations of the response (from the already fitted model) to estimate the distribution of the smoothing parameters given the response data. @@ -1955,6 +2012,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None, coef_draws = self._sample_coef( X, y, weights=weights, n_draws=n_draws, n_bootstraps=n_bootstraps, objective=objective) + if quantity == 'coef': return coef_draws @@ -1977,8 +2035,6 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1, `n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage of the expensive bootstrap samples of the smoothing parameters. - For now, the grid of `lam` values is the default of `gridsearch`. - Parameters ----------- X : array of shape (n_samples, m_features) @@ -2039,7 +2095,14 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1, def _bootstrap_samples_of_smoothing(self, X, y, weights=None, n_bootstraps=1, objective='auto'): - """Sample the smoothing parameters using simulated response data.""" + """Sample the smoothing parameters using simulated response data. + + + For now, the grid of `lam` values is 11 random points in M-dimensional + space, where M = the number of lam values, ie len(flatten(gam.lam)) + + all values are in [1e-3, 1e3] + """ mu = self.predict_mu(X) # Wood pg. 198 step 1 coef_bootstraps = [self.coef_] cov_bootstraps = [ @@ -2059,7 +2122,12 @@ def _bootstrap_samples_of_smoothing(self, X, y, weights=None, # `n_bootstraps > 1`. gam = deepcopy(self) gam.set_params(self.get_params()) - gam.gridsearch(X, y_bootstrap, weights=weights, + + # create a random search of 11 points in lam space + # with all values in [1e-3, 1e3] + lam_grid = np.random.randn(11, len(flatten(self.lam))) * 6 - 3 + lam_grid = np.exp(lam_grid) + gam.gridsearch(X, y_bootstrap, weights=weights, lam=lam_grid, objective=objective) lam = gam.lam diff --git a/pygam/terms.py b/pygam/terms.py index 8945ebb7..9af00f01 100644 --- a/pygam/terms.py +++ b/pygam/terms.py @@ -416,7 +416,7 @@ def __init__(self, verbose=False): self._name = 'intercept_term' self._minimal_name = 'intercept' - super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=0, penalties=None, constraints=None, verbose=verbose) + super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=None, penalties=None, constraints=None, verbose=verbose) self._exclude += ['fit_splines', 'fit_linear', 'lam', 'penalties', 'constraints', 'feature', 'dtype'] self._args = [] diff --git a/pygam/tests/test_gridsearch.py b/pygam/tests/test_gridsearch.py index d6e24c93..c638f3ad 100644 --- a/pygam/tests/test_gridsearch.py +++ b/pygam/tests/test_gridsearch.py @@ -77,7 +77,7 @@ def test_gridsearch_all_dimensions_independent(cake_X_y): """ check that gridsearch searches all dimensions of lambda independently """ - n = 3 + n = 4 X, y = cake_X_y m = X.shape[1] @@ -88,6 +88,44 @@ def test_gridsearch_all_dimensions_independent(cake_X_y): assert(len(scores) == n**m) assert(m > 1) +def test_no_cartesian_product(cake_X_y): + """ + check that gridsearch does not do a cartesian product when a 2D numpy array is + passed as the grid and the number of columns matches the len of the parameter + """ + n = 5 + X, y = cake_X_y + m = X.shape[1] + + lams = np.array([np.logspace(-3,3, n)]*m).T + assert lams.shape == (n, m) + + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + + assert(len(scores) == n) + assert(m > 1) + +def test_wrong_grid_shape(cake_X_y): + """ + check that gridsearch raises a ValueError when the grid shape cannot be interpretted + """ + X, y = cake_X_y + lams = np.random.rand(50, X.shape[1] + 1) + + with pytest.raises(ValueError): + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + + lams = lams.T.tolist() + assert len(lams) == X.shape[1] + 1 + with pytest.raises(ValueError): + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + def test_GCV_objective_is_for_unknown_scale(mcycle_X_y, default_X_y, coal_X_y, trees_X_y): """ check that we use the GCV objective only for models with unknown scale