From b9c95df2d52e90ecf6bc3d8cc7c6dc65ba5797f2 Mon Sep 17 00:00:00 2001
From: dswah
Date: Fri, 14 Sep 2018 02:15:49 +0200
Subject: [PATCH 1/4] gridsearch allows no cartesian product, sampling exploits this to do 5 bootstraps

---
 pygam/pygam.py                 | 104 +++++++++++++++++++++++++++------
 pygam/tests/test_gridsearch.py |  35 +++++++++++
 2 files changed, 121 insertions(+), 18 deletions(-)

diff --git a/pygam/pygam.py b/pygam/pygam.py
index c2f18625..ee332c50 100644
--- a/pygam/pygam.py
+++ b/pygam/pygam.py
@@ -1690,12 +1690,16 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
         progress : bool, default: True
             whether to display a progress bar

-        **kwargs : dict, default {'lam': np.logspace(-3, 3, 11)}
+        **kwargs : dict, default `lam=np.logspace(-3, 3, 11)`
             pairs of parameters and iterables of floats, or
             parameters and iterables of iterables of floats.

-            if iterable of iterables of floats, the outer iterable must have
-            length m_features.
+            if the grid is an iterable of iterables of floats,
+            the outer iterable must have length m_features.
+            the cartesian product of the subgrids in the grid will be tested.
+
+            if the grid is a 2d numpy array,
+            each row of the array will be tested.

             the method will make a grid of all the combinations of the parameters
             and fit a GAM to each combination.
@@ -1709,6 +1713,29 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
             objective scores as values
         else:
             self, ie possibly the newly fitted model
+
+        Examples
+        --------
+        For a model with 3 terms, where we expect 3 lam values,
+        our search space for lam must have 3 dimensions.
+
+        However, we can search the space in 2 ways:
+        - via cartesian product, by specifying the grid as a list;
+          our grid search will consider 11 ** 3 points
+
+        >>> lam = np.logspace(-3, 3, 11)
+        >>> lams = [lam] * 3
+        >>> gam.gridsearch(X, y, lam=lams)
+
+        - directly, by specifying the grid as a np.ndarray whose rows are the points to test;
+          our grid search will consider 11 points
+
+        >>> lam = np.logspace(-3, 3, 11)
+        >>> lams = np.array([lam] * 3).T
+        >>> gam.gridsearch(X, y, lam=lams)
+
+        the latter is useful when the dimensionality of the search space
+        is very large, and we would prefer to execute a randomized search.
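As an illustrative sketch (it assumes a fitted GAM `gam` with 3 lam values and data `X`, `y` in scope; the point count of 50 is arbitrary), such a randomized search can pass a 2D array whose rows are random lam vectors:

>>> # 50 random points in log10-lam space, mapped into [1e-3, 1e3]
>>> lams = 10 ** (np.random.rand(50, 3) * 6 - 3)
>>> gam.gridsearch(X, y, lam=lams)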
""" # check if model fitted if not self._is_fitted: @@ -1768,11 +1795,28 @@ def gridsearch(self, X, y, weights=None, return_scores=False, # prepare grid if any(isiterable(g) for g in grid): - # cast to np.array - grid = [np.atleast_1d(g) for g in grid] - # set grid to combination of all grids - grid = combine(*grid) + target_len = len(flatten(getattr(self, param))) + + # check if no cartesian product needed + if isinstance(grid, np.ndarray) and grid.ndim == 2: + if grid.shape[1] == target_len: + # no cartesian product needed + grid = grid.T.tolist() + else: + raise ValueError('{} grid should have {} columns, '\ + 'but found grid with shape {}'.format(param, target_len, grid.shape)) + + # do cartesian product + else: + # cast to np.array + grid = [np.atleast_1d(g) for g in grid] + if len(grid) == target_len: + # cartesian product of subgrids + grid = combine(*grid) + else: + raise ValueError('{} grid should have length {}, '\ + 'but found grid of length {}'.format(param, target_len, len(grid))) # save param name and grid params.append(param) @@ -1852,8 +1896,26 @@ def gridsearch(self, X, y, weights=None, return_scores=False, else: return self + def randomsearch(self, X, y, weights=None, n=20, **param_grids): + + if not self._is_fitted: + self._validate_params() + self._validate_data_dep_params(X) + + # if no params, then set up default search + if not bool(param_grids): + lam = np.random.rand(n, len(flatten(self.lam))) * 6 - 3 + lam = np.exp(lam) + param_grids['lam'] = lam + + # TODO what should happen when we specify more than one parameter? + # how do we search the random space of all parameters + + return self.gridsearch(X, y, weights=weights, **param_grids) + + def sample(self, X, y, quantity='y', sample_at_X=None, - weights=None, n_draws=100, n_bootstraps=1, objective='auto'): + weights=None, n_draws=100, n_bootstraps=5, objective='auto'): """Simulate from the posterior of the coefficients and smoothing params. Samples are drawn from the posterior of the coefficients and smoothing @@ -1882,11 +1944,6 @@ def sample(self, X, y, quantity='y', sample_at_X=None, `n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage of the expensive bootstrap samples of the smoothing parameters. - NOTE: For now, the grid of `lam` values is the default of `gridsearch`. - Until randomized grid search is implemented, it is not worth setting - `n_bootstraps` to a value greater than one because the smoothing - parameters will be identical in each bootstrap sample. - Parameters ----------- X : array of shape (n_samples, m_features) @@ -1915,7 +1972,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None, The number of samples to draw from the posterior distribution of the coefficients and smoothing parameters - n_bootstraps : positive int, default: 1 + n_bootstraps : positive int, default: 5 The number of bootstrap samples to draw from simulations of the response (from the already fitted model) to estimate the distribution of the smoothing parameters given the response data. @@ -1955,6 +2012,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None, coef_draws = self._sample_coef( X, y, weights=weights, n_draws=n_draws, n_bootstraps=n_bootstraps, objective=objective) + if quantity == 'coef': return coef_draws @@ -1977,8 +2035,6 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1, `n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage of the expensive bootstrap samples of the smoothing parameters. 
- For now, the grid of `lam` values is the default of `gridsearch`. - Parameters ----------- X : array of shape (n_samples, m_features) @@ -2039,7 +2095,14 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1, def _bootstrap_samples_of_smoothing(self, X, y, weights=None, n_bootstraps=1, objective='auto'): - """Sample the smoothing parameters using simulated response data.""" + """Sample the smoothing parameters using simulated response data. + + + For now, the grid of `lam` values is 11 random points in M-dimensional + space, where M = the number of lam values, ie len(flatten(gam.lam)) + + all values are in [1e-3, 1e3] + """ mu = self.predict_mu(X) # Wood pg. 198 step 1 coef_bootstraps = [self.coef_] cov_bootstraps = [ @@ -2059,7 +2122,12 @@ def _bootstrap_samples_of_smoothing(self, X, y, weights=None, # `n_bootstraps > 1`. gam = deepcopy(self) gam.set_params(self.get_params()) - gam.gridsearch(X, y_bootstrap, weights=weights, + + # create a random search of 11 points in lam space + # with all values in [1e-3, 1e3] + lam_grid = np.random.randn(11, len(flatten(self.lam))) * 6 - 3 + lam_grid = np.exp(lam_grid) + gam.gridsearch(X, y_bootstrap, weights=weights, lam=lam_grid, objective=objective) lam = gam.lam diff --git a/pygam/tests/test_gridsearch.py b/pygam/tests/test_gridsearch.py index d6e24c93..9bb6a97e 100644 --- a/pygam/tests/test_gridsearch.py +++ b/pygam/tests/test_gridsearch.py @@ -88,6 +88,41 @@ def test_gridsearch_all_dimensions_independent(cake_X_y): assert(len(scores) == n**m) assert(m > 1) +def test_no_cartesian_product(cake_X_y): + """ + check that gridsearch does not do a cartesian product when a 2D numpy array is + passed as the grid and the number of columns matches the len of the parameter + """ + n = 3 + X, y = cake_X_y + m = X.shape[1] + + scores = LinearGAM().gridsearch(X, y, + lam=np.array([np.logspace(-3,3, n)]*m), + return_scores=True) + + assert(len(scores) == n) + assert(m > 1) + +def test_wrong_grid_shape(cake_X_y): + """ + check that gridsearch raises a ValueError when the grid shape cannot be interpretted + """ + X, y = cake_X_y + lams = np.random.rand(50, X.shape[1] + 1) + + with pytest.raises(ValueError): + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + + lams = lams.T.tolist() + assert len(lams) == X.shape[1] + 1 + with pytest.raises(ValueError): + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + def test_GCV_objective_is_for_unknown_scale(mcycle_X_y, default_X_y, coal_X_y, trees_X_y): """ check that we use the GCV objective only for models with unknown scale From c996fb3b6631e7719526fe17ee71309a604414ef Mon Sep 17 00:00:00 2001 From: dswah Date: Fri, 14 Sep 2018 17:16:12 +0200 Subject: [PATCH 2/4] fix bug in grid shapes --- pygam/pygam.py | 38 +++++++++++++++++----------------- pygam/tests/test_gridsearch.py | 9 +++++--- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index ee332c50..158899e5 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -1785,9 +1785,11 @@ def gridsearch(self, X, y, weights=None, return_scores=False, grids = [] for param, grid in list(param_grids.items()): + # check param exists if param not in (admissible_params): raise ValueError('unknown parameter: {}'.format(param)) + # check grid is iterable at all if not (isiterable(grid) and (len(grid) > 1)): \ raise ValueError('{} grid must either be iterable of ' 'iterables, or an iterable of lengnth > 1, '\ @@ -1796,27 +1798,25 @@ def gridsearch(self, X, y, weights=None, 
return_scores=False, # prepare grid if any(isiterable(g) for g in grid): + # get required parameter shape target_len = len(flatten(getattr(self, param))) - # check if no cartesian product needed - if isinstance(grid, np.ndarray) and grid.ndim == 2: - if grid.shape[1] == target_len: - # no cartesian product needed - grid = grid.T.tolist() - else: - raise ValueError('{} grid should have {} columns, '\ - 'but found grid with shape {}'.format(param, target_len, grid.shape)) - - # do cartesian product - else: - # cast to np.array - grid = [np.atleast_1d(g) for g in grid] - if len(grid) == target_len: - # cartesian product of subgrids - grid = combine(*grid) - else: - raise ValueError('{} grid should have length {}, '\ - 'but found grid of length {}'.format(param, target_len, len(grid))) + # check if cartesian product needed + cartesian = (not isinstance(grid, np.ndarray) or grid.ndim != 2) + + # build grid + grid = [np.atleast_1d(g) for g in grid] + + # check chape + msg = '{} grid should have {} columns, '\ + 'but found grid with {} columns'.format(param, target_len, len(grid)) + if cartesian: + if len(grid) != target_len: + raise ValueError(msg) + grid = combine(*grid) + + if not all([len(subgrid) == target_len for subgrid in grid]): + raise ValueError(msg) # save param name and grid params.append(param) diff --git a/pygam/tests/test_gridsearch.py b/pygam/tests/test_gridsearch.py index 9bb6a97e..c638f3ad 100644 --- a/pygam/tests/test_gridsearch.py +++ b/pygam/tests/test_gridsearch.py @@ -77,7 +77,7 @@ def test_gridsearch_all_dimensions_independent(cake_X_y): """ check that gridsearch searches all dimensions of lambda independently """ - n = 3 + n = 4 X, y = cake_X_y m = X.shape[1] @@ -93,12 +93,15 @@ def test_no_cartesian_product(cake_X_y): check that gridsearch does not do a cartesian product when a 2D numpy array is passed as the grid and the number of columns matches the len of the parameter """ - n = 3 + n = 5 X, y = cake_X_y m = X.shape[1] + lams = np.array([np.logspace(-3,3, n)]*m).T + assert lams.shape == (n, m) + scores = LinearGAM().gridsearch(X, y, - lam=np.array([np.logspace(-3,3, n)]*m), + lam=lams, return_scores=True) assert(len(scores) == n) From 73df735256256058e800b13a58a7df6900087c09 Mon Sep 17 00:00:00 2001 From: dswah Date: Fri, 14 Sep 2018 17:16:56 +0200 Subject: [PATCH 3/4] intercept should have lam=None, summary prints blank for intercept lam --- pygam/pygam.py | 2 +- pygam/terms.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index 158899e5..8bc372fd 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -1614,7 +1614,7 @@ def summary(self): term_data = { 'feature_func': repr(term), - 'lam': np.round(flatten(term.lam), 4), + 'lam': '' if term.isintercept else np.round(flatten(term.lam), 4), 'rank': '{}'.format(term.n_coefs), 'edof': '{}'.format(edof), 'p_value': '%.2e'%(self.statistics_['p_values'][i]), diff --git a/pygam/terms.py b/pygam/terms.py index 8945ebb7..9af00f01 100644 --- a/pygam/terms.py +++ b/pygam/terms.py @@ -416,7 +416,7 @@ def __init__(self, verbose=False): self._name = 'intercept_term' self._minimal_name = 'intercept' - super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=0, penalties=None, constraints=None, verbose=verbose) + super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=None, penalties=None, constraints=None, verbose=verbose) self._exclude += ['fit_splines', 'fit_linear', 'lam', 'penalties', 'constraints', 'feature', 
'dtype'] self._args = [] From cf390541100af8b53c5c7a8696f720f511a75036 Mon Sep 17 00:00:00 2001 From: dswah Date: Fri, 14 Sep 2018 17:17:06 +0200 Subject: [PATCH 4/4] bump patch number --- pygam/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygam/__init__.py b/pygam/__init__.py index de02a23e..3cea2f93 100644 --- a/pygam/__init__.py +++ b/pygam/__init__.py @@ -21,4 +21,4 @@ __all__ = ['GAM', 'LinearGAM', 'LogisticGAM', 'GammaGAM', 'PoissonGAM', 'InvGaussGAM', 'ExpectileGAM', 'l', 's', 'f', 'te', 'intercept'] -__version__ = '0.6.1' +__version__ = '0.6.2'
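Taken together, these patches support a workflow like the following sketch (`X` and `y` are assumed training data; the call values are illustrative only):

>>> from pygam import LinearGAM
>>> gam = LinearGAM().fit(X, y)
>>> # randomized lam search: 20 random lam vectors instead of a full
>>> # cartesian grid (each vector is the exponential of uniform draws on [-3, 3])
>>> gam = gam.randomsearch(X, y, n=20)
>>> # posterior simulation now re-estimates lam on 5 bootstrap responses by default
>>> draws = gam.sample(X, y, quantity='y', n_draws=100)
>>> gam.summary()  # the intercept row now prints a blank lam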