From b9c95df2d52e90ecf6bc3d8cc7c6dc65ba5797f2 Mon Sep 17 00:00:00 2001
From: dswah
Date: Fri, 14 Sep 2018 02:15:49 +0200
Subject: [PATCH 1/4] gridsearch allows no cartesian product, sampling exploits this to do 5 bootstraps

---
 pygam/pygam.py                 | 104 +++++++++++++++++++++++++++------
 pygam/tests/test_gridsearch.py |  35 +++++++++++
 2 files changed, 121 insertions(+), 18 deletions(-)

diff --git a/pygam/pygam.py b/pygam/pygam.py
index c2f18625..ee332c50 100644
--- a/pygam/pygam.py
+++ b/pygam/pygam.py
@@ -1690,12 +1690,16 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
         progress : bool, default: True
             whether to display a progress bar

-        **kwargs : dict, default {'lam': np.logspace(-3, 3, 11)}
+        **kwargs : dict, default `lam=np.logspace(-3, 3, 11)`
             pairs of parameters and iterables of floats, or
             parameters and iterables of iterables of floats.

-            if iterable of iterables of floats, the outer iterable must have
-            length m_features.
+            if the grid is an iterable of iterables of floats,
+            the outer iterable must have length m_features.
+            the cartesian product of the subgrids in the grid will be tested.
+
+            if the grid is a 2d numpy array,
+            each row of the array will be tested.

             the method will make a grid of all the combinations of the parameters
             and fit a GAM to each combination.
@@ -1709,6 +1713,29 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
             objective scores as values
         else:
             self, ie possibly the newly fitted model
+
+        Examples
+        --------
+        For a model with 3 terms, where we expect 3 lam values,
+        our search space for lam must have 3 dimensions.
+
+        However, we can search the space in 2 ways:
+        - via cartesian product, by specifying the grid as a list;
+          our grid search will consider 11 ** 3 points
+
+        >>> lam = np.logspace(-3, 3, 11)
+        >>> lams = [lam] * 3
+        >>> gam.gridsearch(X, y, lam=lams)
+
+        - directly, by specifying the grid as a np.ndarray whose rows are the points to test;
+          our grid search will consider 11 points
+
+        >>> lam = np.logspace(-3, 3, 11)
+        >>> lams = np.array([lam] * 3).T
+        >>> gam.gridsearch(X, y, lam=lams)
+
+        the latter is useful when the dimensionality of the search space
+        is very large, and we would prefer to execute a randomized search.
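As an illustrative sketch (it assumes a fitted GAM `gam` with 3 lam values and data `X`, `y` in scope; the point count of 50 is arbitrary), such a randomized search can pass a 2D array whose rows are random lam vectors:

>>> # 50 random points in log10-lam space, mapped into [1e-3, 1e3]
>>> lams = 10 ** (np.random.rand(50, 3) * 6 - 3)
>>> gam.gridsearch(X, y, lam=lams)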
""" # check if model fitted if not self._is_fitted: @@ -1768,11 +1795,28 @@ def gridsearch(self, X, y, weights=None, return_scores=False, # prepare grid if any(isiterable(g) for g in grid): - # cast to np.array - grid = [np.atleast_1d(g) for g in grid] - # set grid to combination of all grids - grid = combine(*grid) + target_len = len(flatten(getattr(self, param))) + + # check if no cartesian product needed + if isinstance(grid, np.ndarray) and grid.ndim == 2: + if grid.shape[1] == target_len: + # no cartesian product needed + grid = grid.T.tolist() + else: + raise ValueError('{} grid should have {} columns, '\ + 'but found grid with shape {}'.format(param, target_len, grid.shape)) + + # do cartesian product + else: + # cast to np.array + grid = [np.atleast_1d(g) for g in grid] + if len(grid) == target_len: + # cartesian product of subgrids + grid = combine(*grid) + else: + raise ValueError('{} grid should have length {}, '\ + 'but found grid of length {}'.format(param, target_len, len(grid))) # save param name and grid params.append(param) @@ -1852,8 +1896,26 @@ def gridsearch(self, X, y, weights=None, return_scores=False, else: return self + def randomsearch(self, X, y, weights=None, n=20, **param_grids): + + if not self._is_fitted: + self._validate_params() + self._validate_data_dep_params(X) + + # if no params, then set up default search + if not bool(param_grids): + lam = np.random.rand(n, len(flatten(self.lam))) * 6 - 3 + lam = np.exp(lam) + param_grids['lam'] = lam + + # TODO what should happen when we specify more than one parameter? + # how do we search the random space of all parameters + + return self.gridsearch(X, y, weights=weights, **param_grids) + + def sample(self, X, y, quantity='y', sample_at_X=None, - weights=None, n_draws=100, n_bootstraps=1, objective='auto'): + weights=None, n_draws=100, n_bootstraps=5, objective='auto'): """Simulate from the posterior of the coefficients and smoothing params. Samples are drawn from the posterior of the coefficients and smoothing @@ -1882,11 +1944,6 @@ def sample(self, X, y, quantity='y', sample_at_X=None, `n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage of the expensive bootstrap samples of the smoothing parameters. - NOTE: For now, the grid of `lam` values is the default of `gridsearch`. - Until randomized grid search is implemented, it is not worth setting - `n_bootstraps` to a value greater than one because the smoothing - parameters will be identical in each bootstrap sample. - Parameters ----------- X : array of shape (n_samples, m_features) @@ -1915,7 +1972,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None, The number of samples to draw from the posterior distribution of the coefficients and smoothing parameters - n_bootstraps : positive int, default: 1 + n_bootstraps : positive int, default: 5 The number of bootstrap samples to draw from simulations of the response (from the already fitted model) to estimate the distribution of the smoothing parameters given the response data. @@ -1955,6 +2012,7 @@ def sample(self, X, y, quantity='y', sample_at_X=None, coef_draws = self._sample_coef( X, y, weights=weights, n_draws=n_draws, n_bootstraps=n_bootstraps, objective=objective) + if quantity == 'coef': return coef_draws @@ -1977,8 +2035,6 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1, `n_bootstraps` small. Make `n_bootstraps < n_draws` to take advantage of the expensive bootstrap samples of the smoothing parameters. 
- For now, the grid of `lam` values is the default of `gridsearch`. - Parameters ----------- X : array of shape (n_samples, m_features) @@ -2039,7 +2095,14 @@ def _sample_coef(self, X, y, weights=None, n_draws=100, n_bootstraps=1, def _bootstrap_samples_of_smoothing(self, X, y, weights=None, n_bootstraps=1, objective='auto'): - """Sample the smoothing parameters using simulated response data.""" + """Sample the smoothing parameters using simulated response data. + + + For now, the grid of `lam` values is 11 random points in M-dimensional + space, where M = the number of lam values, ie len(flatten(gam.lam)) + + all values are in [1e-3, 1e3] + """ mu = self.predict_mu(X) # Wood pg. 198 step 1 coef_bootstraps = [self.coef_] cov_bootstraps = [ @@ -2059,7 +2122,12 @@ def _bootstrap_samples_of_smoothing(self, X, y, weights=None, # `n_bootstraps > 1`. gam = deepcopy(self) gam.set_params(self.get_params()) - gam.gridsearch(X, y_bootstrap, weights=weights, + + # create a random search of 11 points in lam space + # with all values in [1e-3, 1e3] + lam_grid = np.random.randn(11, len(flatten(self.lam))) * 6 - 3 + lam_grid = np.exp(lam_grid) + gam.gridsearch(X, y_bootstrap, weights=weights, lam=lam_grid, objective=objective) lam = gam.lam diff --git a/pygam/tests/test_gridsearch.py b/pygam/tests/test_gridsearch.py index d6e24c93..9bb6a97e 100644 --- a/pygam/tests/test_gridsearch.py +++ b/pygam/tests/test_gridsearch.py @@ -88,6 +88,41 @@ def test_gridsearch_all_dimensions_independent(cake_X_y): assert(len(scores) == n**m) assert(m > 1) +def test_no_cartesian_product(cake_X_y): + """ + check that gridsearch does not do a cartesian product when a 2D numpy array is + passed as the grid and the number of columns matches the len of the parameter + """ + n = 3 + X, y = cake_X_y + m = X.shape[1] + + scores = LinearGAM().gridsearch(X, y, + lam=np.array([np.logspace(-3,3, n)]*m), + return_scores=True) + + assert(len(scores) == n) + assert(m > 1) + +def test_wrong_grid_shape(cake_X_y): + """ + check that gridsearch raises a ValueError when the grid shape cannot be interpretted + """ + X, y = cake_X_y + lams = np.random.rand(50, X.shape[1] + 1) + + with pytest.raises(ValueError): + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + + lams = lams.T.tolist() + assert len(lams) == X.shape[1] + 1 + with pytest.raises(ValueError): + scores = LinearGAM().gridsearch(X, y, + lam=lams, + return_scores=True) + def test_GCV_objective_is_for_unknown_scale(mcycle_X_y, default_X_y, coal_X_y, trees_X_y): """ check that we use the GCV objective only for models with unknown scale From c996fb3b6631e7719526fe17ee71309a604414ef Mon Sep 17 00:00:00 2001 From: dswah Date: Fri, 14 Sep 2018 17:16:12 +0200 Subject: [PATCH 2/4] fix bug in grid shapes --- pygam/pygam.py | 38 +++++++++++++++++----------------- pygam/tests/test_gridsearch.py | 9 +++++--- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index ee332c50..158899e5 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -1785,9 +1785,11 @@ def gridsearch(self, X, y, weights=None, return_scores=False, grids = [] for param, grid in list(param_grids.items()): + # check param exists if param not in (admissible_params): raise ValueError('unknown parameter: {}'.format(param)) + # check grid is iterable at all if not (isiterable(grid) and (len(grid) > 1)): \ raise ValueError('{} grid must either be iterable of ' 'iterables, or an iterable of lengnth > 1, '\ @@ -1796,27 +1798,25 @@ def gridsearch(self, X, y, weights=None, 
return_scores=False, # prepare grid if any(isiterable(g) for g in grid): + # get required parameter shape target_len = len(flatten(getattr(self, param))) - # check if no cartesian product needed - if isinstance(grid, np.ndarray) and grid.ndim == 2: - if grid.shape[1] == target_len: - # no cartesian product needed - grid = grid.T.tolist() - else: - raise ValueError('{} grid should have {} columns, '\ - 'but found grid with shape {}'.format(param, target_len, grid.shape)) - - # do cartesian product - else: - # cast to np.array - grid = [np.atleast_1d(g) for g in grid] - if len(grid) == target_len: - # cartesian product of subgrids - grid = combine(*grid) - else: - raise ValueError('{} grid should have length {}, '\ - 'but found grid of length {}'.format(param, target_len, len(grid))) + # check if cartesian product needed + cartesian = (not isinstance(grid, np.ndarray) or grid.ndim != 2) + + # build grid + grid = [np.atleast_1d(g) for g in grid] + + # check chape + msg = '{} grid should have {} columns, '\ + 'but found grid with {} columns'.format(param, target_len, len(grid)) + if cartesian: + if len(grid) != target_len: + raise ValueError(msg) + grid = combine(*grid) + + if not all([len(subgrid) == target_len for subgrid in grid]): + raise ValueError(msg) # save param name and grid params.append(param) diff --git a/pygam/tests/test_gridsearch.py b/pygam/tests/test_gridsearch.py index 9bb6a97e..c638f3ad 100644 --- a/pygam/tests/test_gridsearch.py +++ b/pygam/tests/test_gridsearch.py @@ -77,7 +77,7 @@ def test_gridsearch_all_dimensions_independent(cake_X_y): """ check that gridsearch searches all dimensions of lambda independently """ - n = 3 + n = 4 X, y = cake_X_y m = X.shape[1] @@ -93,12 +93,15 @@ def test_no_cartesian_product(cake_X_y): check that gridsearch does not do a cartesian product when a 2D numpy array is passed as the grid and the number of columns matches the len of the parameter """ - n = 3 + n = 5 X, y = cake_X_y m = X.shape[1] + lams = np.array([np.logspace(-3,3, n)]*m).T + assert lams.shape == (n, m) + scores = LinearGAM().gridsearch(X, y, - lam=np.array([np.logspace(-3,3, n)]*m), + lam=lams, return_scores=True) assert(len(scores) == n) From 73df735256256058e800b13a58a7df6900087c09 Mon Sep 17 00:00:00 2001 From: dswah Date: Fri, 14 Sep 2018 17:16:56 +0200 Subject: [PATCH 3/4] intercept should have lam=None, summary prints blank for intercept lam --- pygam/pygam.py | 2 +- pygam/terms.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index 158899e5..8bc372fd 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -1614,7 +1614,7 @@ def summary(self): term_data = { 'feature_func': repr(term), - 'lam': np.round(flatten(term.lam), 4), + 'lam': '' if term.isintercept else np.round(flatten(term.lam), 4), 'rank': '{}'.format(term.n_coefs), 'edof': '{}'.format(edof), 'p_value': '%.2e'%(self.statistics_['p_values'][i]), diff --git a/pygam/terms.py b/pygam/terms.py index 8945ebb7..9af00f01 100644 --- a/pygam/terms.py +++ b/pygam/terms.py @@ -416,7 +416,7 @@ def __init__(self, verbose=False): self._name = 'intercept_term' self._minimal_name = 'intercept' - super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=0, penalties=None, constraints=None, verbose=verbose) + super(Intercept, self).__init__(feature=None, fit_linear=False, fit_splines=False, lam=None, penalties=None, constraints=None, verbose=verbose) self._exclude += ['fit_splines', 'fit_linear', 'lam', 'penalties', 'constraints', 'feature', 
'dtype'] self._args = [] From cf390541100af8b53c5c7a8696f720f511a75036 Mon Sep 17 00:00:00 2001 From: dswah Date: Fri, 14 Sep 2018 17:17:06 +0200 Subject: [PATCH 4/4] bump patch number --- pygam/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygam/__init__.py b/pygam/__init__.py index de02a23e..3cea2f93 100644 --- a/pygam/__init__.py +++ b/pygam/__init__.py @@ -21,4 +21,4 @@ __all__ = ['GAM', 'LinearGAM', 'LogisticGAM', 'GammaGAM', 'PoissonGAM', 'InvGaussGAM', 'ExpectileGAM', 'l', 's', 'f', 'te', 'intercept'] -__version__ = '0.6.1' +__version__ = '0.6.2'
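Taken together, these patches support a workflow like the following sketch (`X` and `y` are assumed training data; the call values are illustrative only):

>>> from pygam import LinearGAM
>>> gam = LinearGAM().fit(X, y)
>>> # randomized lam search: 20 random lam vectors instead of a full
>>> # cartesian grid (each vector is the exponential of uniform draws on [-3, 3])
>>> gam = gam.randomsearch(X, y, n=20)
>>> # posterior simulation now re-estimates lam on 5 bootstrap responses by default
>>> draws = gam.sample(X, y, quantity='y', n_draws=100)
>>> gam.summary()  # the intercept row now prints a blank lam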