Skip to content

Commit

Permalink
Merge pull request #200 from dswah/grid
Browse files Browse the repository at this point in the history
[WIP] improve Gridsearch
  • Loading branch information
dswah authored Sep 17, 2018
2 parents 75701bc + cf39054 commit 58df6e9
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 21 deletions.
2 changes: 1 addition & 1 deletion pygam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@
__all__ = ['GAM', 'LinearGAM', 'LogisticGAM', 'GammaGAM', 'PoissonGAM',
'InvGaussGAM', 'ExpectileGAM', 'l', 's', 'f', 'te', 'intercept']

__version__ = '0.6.1'
__version__ = '0.6.2'
104 changes: 86 additions & 18 deletions pygam/pygam.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,7 +1614,7 @@ def summary(self):

term_data = {
'feature_func': repr(term),
'lam': np.round(flatten(term.lam), 4),
'lam': '' if term.isintercept else np.round(flatten(term.lam), 4),
'rank': '{}'.format(term.n_coefs),
'edof': '{}'.format(edof),
'p_value': '%.2e'%(self.statistics_['p_values'][i]),
Expand Down Expand Up @@ -1690,12 +1690,16 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
progress : bool, default: True
whether to display a progress bar
**kwargs : dict, default {'lam': np.logspace(-3, 3, 11)}
**kwargs : dict, default `lam=np.logspace(-3, 3, 11)`}
pairs of parameters and iterables of floats, or
parameters and iterables of iterables of floats.
if iterable of iterables of floats, the outer iterable must have
length m_features.
if grid is iterable of iterables of floats,
the outer iterable must have length m_features.
the cartesian product of the subgrids in the grid will be tested.
if grid is a 2d numpy array,
each row of the array will be tested.
the method will make a grid of all the combinations of the parameters
and fit a GAM to each combination.
Expand All @@ -1709,6 +1713,29 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
objective scores as values
else:
self, ie possibly the newly fitted model
Examples
--------
For a model with 3 terms, and where we expect 3 lam values,
our search space for lam must have 3 dimensions.
However we can search the space in 2 ways:
- via cartesian product by specifying the grid as a list
our grid search will consider 11 ** 3 points
>>> lam = np.logspace(-3, 3, 11)
>>> lams = [lam] * 3
>>> gam.gridsearch(X, y, lam=lams)
- directly by specifying the grid as a np.ndarray
our gridsearch will consider 11 points
>>> lam = np.logspace(-3, 3, 11)
>>> lams = np.array([lam] * 3)
>>> gam.gridsearch(X, y, lam=lams)
the latter is useful for when the dimensionality of the search space
is very large, and we would prefer to execute a randomized search.
"""
# check if model fitted
if not self._is_fitted:
Expand Down Expand Up @@ -1758,21 +1785,38 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
grids = []
for param, grid in list(param_grids.items()):

# check param exists
if param not in (admissible_params):
raise ValueError('unknown parameter: {}'.format(param))

# check grid is iterable at all
if not (isiterable(grid) and (len(grid) > 1)): \
raise ValueError('{} grid must either be iterable of '
'iterables, or an iterable of lengnth > 1, '\
'but found {}'.format(param, grid))

# prepare grid
if any(isiterable(g) for g in grid):
# cast to np.array

# get required parameter shape
target_len = len(flatten(getattr(self, param)))

# check if cartesian product needed
cartesian = (not isinstance(grid, np.ndarray) or grid.ndim != 2)

# build grid
grid = [np.atleast_1d(g) for g in grid]

# set grid to combination of all grids
grid = combine(*grid)
# check chape
msg = '{} grid should have {} columns, '\
'but found grid with {} columns'.format(param, target_len, len(grid))
if cartesian:
if len(grid) != target_len:
raise ValueError(msg)
grid = combine(*grid)

if not all([len(subgrid) == target_len for subgrid in grid]):
raise ValueError(msg)

# save param name and grid
params.append(param)
Expand Down Expand Up @@ -1852,8 +1896,26 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
else:
return self

def randomsearch(self, X, y, weights=None, n=20, **param_grids):

if not self._is_fitted:
self._validate_params()
self._validate_data_dep_params(X)

# if no params, then set up default search
if not bool(param_grids):
lam = np.random.rand(n, len(flatten(self.lam))) * 6 - 3
lam = np.exp(lam)
param_grids['lam'] = lam

# TODO what should happen when we specify more than one parameter?
# how do we search the random space of all parameters

return self.gridsearch(X, y, weights=weights, **param_grids)


def sample(self, X, y, quantity='y', sample_at_X=None,
weights=None, n_draws=100, n_bootstraps=1, objective='auto'):
weights=None, n_draws=100, n_bootstraps=5, objective='auto'):
"""Simulate from the posterior of the coefficients and smoothing params.
Samples are drawn from the posterior of the coefficients and smoothing
Expand Down Expand Up @@ -1882,11 +1944,6 @@ def sample(self, X, y, quantity='y', sample_at_X=None,
`n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage
of the expensive bootstrap samples of the smoothing parameters.
NOTE: For now, the grid of `lam` values is the default of `gridsearch`.
Until randomized grid search is implemented, it is not worth setting
`n_bootstraps` to a value greater than one because the smoothing
parameters will be identical in each bootstrap sample.
Parameters
-----------
X : array of shape (n_samples, m_features)
Expand Down Expand Up @@ -1915,7 +1972,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None,
The number of samples to draw from the posterior distribution of
the coefficients and smoothing parameters
n_bootstraps : positive int, default: 1
n_bootstraps : positive int, default: 5
The number of bootstrap samples to draw from simulations of the
response (from the already fitted model) to estimate the
distribution of the smoothing parameters given the response data.
Expand Down Expand Up @@ -1955,6 +2012,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None,
coef_draws = self._sample_coef(
X, y, weights=weights, n_draws=n_draws,
n_bootstraps=n_bootstraps, objective=objective)

if quantity == 'coef':
return coef_draws

Expand All @@ -1977,8 +2035,6 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1,
`n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage
of the expensive bootstrap samples of the smoothing parameters.
For now, the grid of `lam` values is the default of `gridsearch`.
Parameters
-----------
X : array of shape (n_samples, m_features)
Expand Down Expand Up @@ -2039,7 +2095,14 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1,

def _bootstrap_samples_of_smoothing(self, X, y, weights=None,
n_bootstraps=1, objective='auto'):
"""Sample the smoothing parameters using simulated response data."""
"""Sample the smoothing parameters using simulated response data.
For now, the grid of `lam` values is 11 random points in M-dimensional
space, where M = the number of lam values, ie len(flatten(gam.lam))
all values are in [1e-3, 1e3]
"""
mu = self.predict_mu(X) # Wood pg. 198 step 1
coef_bootstraps = [self.coef_]
cov_bootstraps = [
Expand All @@ -2059,7 +2122,12 @@ def _bootstrap_samples_of_smoothing(self, X, y, weights=None,
# `n_bootstraps > 1`.
gam = deepcopy(self)
gam.set_params(self.get_params())
gam.gridsearch(X, y_bootstrap, weights=weights,

# create a random search of 11 points in lam space
# with all values in [1e-3, 1e3]
lam_grid = np.random.randn(11, len(flatten(self.lam))) * 6 - 3
lam_grid = np.exp(lam_grid)
gam.gridsearch(X, y_bootstrap, weights=weights, lam=lam_grid,
objective=objective)
lam = gam.lam

Expand Down
2 changes: 1 addition & 1 deletion pygam/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def __init__(self, verbose=False):
self._name = 'intercept_term'
self._minimal_name = 'intercept'

super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=0, penalties=None, constraints=None, verbose=verbose)
super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=None, penalties=None, constraints=None, verbose=verbose)

self._exclude += ['fit_splines', 'fit_linear', 'lam', 'penalties', 'constraints', 'feature', 'dtype']
self._args = []
Expand Down
40 changes: 39 additions & 1 deletion pygam/tests/test_gridsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_gridsearch_all_dimensions_independent(cake_X_y):
"""
check that gridsearch searches all dimensions of lambda independently
"""
n = 3
n = 4
X, y = cake_X_y
m = X.shape[1]

Expand All @@ -88,6 +88,44 @@ def test_gridsearch_all_dimensions_independent(cake_X_y):
assert(len(scores) == n**m)
assert(m > 1)

def test_no_cartesian_product(cake_X_y):
"""
check that gridsearch does not do a cartesian product when a 2D numpy array is
passed as the grid and the number of columns matches the len of the parameter
"""
n = 5
X, y = cake_X_y
m = X.shape[1]

lams = np.array([np.logspace(-3,3, n)]*m).T
assert lams.shape == (n, m)

scores = LinearGAM().gridsearch(X, y,
lam=lams,
return_scores=True)

assert(len(scores) == n)
assert(m > 1)

def test_wrong_grid_shape(cake_X_y):
"""
check that gridsearch raises a ValueError when the grid shape cannot be interpretted
"""
X, y = cake_X_y
lams = np.random.rand(50, X.shape[1] + 1)

with pytest.raises(ValueError):
scores = LinearGAM().gridsearch(X, y,
lam=lams,
return_scores=True)

lams = lams.T.tolist()
assert len(lams) == X.shape[1] + 1
with pytest.raises(ValueError):
scores = LinearGAM().gridsearch(X, y,
lam=lams,
return_scores=True)

def test_GCV_objective_is_for_unknown_scale(mcycle_X_y, default_X_y, coal_X_y, trees_X_y):
"""
check that we use the GCV objective only for models with unknown scale
Expand Down

0 comments on commit 58df6e9

Please sign in to comment.