Address comments; about to split into two PRs

Quantco · AlanTuQC · May 13, 2022 · May 17, 2022 · May 18, 2022 · May 18, 2022
commit c5f566774bd53bf05f229c957a0745b409039dda
diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx
@@ -117,9 +117,6 @@ def enet_coordinate_descent_gram(int[::1] active_set,
     """
 
     cdef unsigned int n_active_features = active_set.shape[0]
-    cdef unsigned int n_features = Q.shape[0]  # are you sure this is correct?
-    # n_features is exactly now the same as n_active_features; should it be q.shape[0]?
-    # however, the variable is never used again, so I guess we are okay
 
     cdef floating w_ii
     cdef floating P1_ii
@@ -141,7 +138,7 @@ def enet_coordinate_descent_gram(int[::1] active_set,
         for n_iter in range(max_iter):
             w_max = 0.0
             d_w_max = 0.0
-            for f_iter in range(n_active_features):
+            for f_iter in range(n_active_features):  # Loop over coordinates
                 if random:
                     active_set_ii = rand_int(n_active_features, rand_r_state)
                 else:
@@ -227,8 +224,8 @@ def enet_coordinate_descent_gram_diag_fisher(
         q = X^T y
 
         This version, for diag_fisher=True, never calculates the entire Hessian matrix; 
-        only rows, when necessary. The inefficiency here is that we must re-calculate 
-        the rows of Q for every iteration up to max_iter.
+        only rows, when necessary. This requires less memory. However, the inefficiency 
+        is that we must re-calculate the rows of Q for every iteration up to max_iter.
     """
 
     cdef unsigned int n_active_features = active_set.shape[0]
@@ -260,14 +257,13 @@ def enet_coordinate_descent_gram_diag_fisher(
     for n_iter in range(max_iter):
         w_max = 0.0
         d_w_max = 0.0
-        for f_iter in range(n_active_features):
+        for f_iter in range(n_active_features):  # Loop over coordinates
             if random:
                 active_set_ii = rand_int(n_active_features, rand_r_state)
             else:
                 active_set_ii = f_iter
             ii = active_set[active_set_ii]
 
-            # Check logic here
             if active_set[0] < <unsigned int>intercept:  
                 if ii == 0:
                     Q_active_set_ii[0] = hessian_rows.sum()
@@ -335,7 +331,7 @@ def enet_coordinate_descent_gram_diag_fisher(
                             "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol),
                             ConvergenceWarning)
 
-    return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check
+    return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1
 
 
 cdef void cython_norm_min_subgrad(

@@ -672,7 +672,8 @@ def __init__(
         A_ineq: Optional[np.ndarray] = None,
         b_ineq: Optional[np.ndarray] = None,
         force_all_finite: bool = True,
-        diag_fisher=False,
+        use_sparse_hessian: bool = True,
+        diag_fisher: bool = False,
     ):
         self.l1_ratio = l1_ratio
         self.P1 = P1
@@ -702,6 +703,7 @@ def __init__(
         self.A_ineq = A_ineq
         self.b_ineq = b_ineq
         self.force_all_finite = force_all_finite
+        self.use_sparse_hessian = use_sparse_hessian
         self.diag_fisher = diag_fisher
 
     @property
@@ -971,6 +973,7 @@ def _solve(
                 lower_bounds=lower_bounds,
                 upper_bounds=upper_bounds,
                 verbose=self.verbose > 0,
+                use_sparse_hessian=self.use_sparse_hessian,
                 diag_fisher=self.diag_fisher,
             )
             if self._solver == "irls-ls":
@@ -979,7 +982,7 @@ def _solve(
                 )
             # 4.2 coordinate descent ##############################################
             elif self._solver == "irls-cd":
-                # [Alan] This is the case we're concerned with for wide problems.
+                # This is the case we're concerned with for wide problems.
                 coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
                     _cd_solver, coef, irls_data
                 )
@@ -2092,6 +2095,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
         ``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for
         details.
 
+    use_sparse_hessian : boolean, optional, (default=True)
+        If ``True``, stores the current state of the Hessian as a sparse COO matrix.
+        If ``False``, stores as a dense matrix of size `num_cols` x `num_cols`, where
+        `num_cols` is the number of columns in the data X. Use ``True`` to avoid memory
+        issues when working with extremely wide data.
+
     diag_fisher : boolean, optional, (default=False)
         Only relevant for solver 'cd' (see also ``start_params='guess'``).
         If ``False``, the full Fisher matrix (expected Hessian) is computed in
@@ -2183,7 +2192,8 @@ def __init__(
         A_ineq: Optional[np.ndarray] = None,
         b_ineq: Optional[np.ndarray] = None,
         force_all_finite: bool = True,
-        diag_fisher=False,
+        use_sparse_hessian: bool = True,
+        diag_fisher: bool = False,
     ):
         self.alphas = alphas
         self.alpha = alpha
@@ -2216,6 +2226,7 @@ def __init__(
             A_ineq=A_ineq,
             b_ineq=b_ineq,
             force_all_finite=force_all_finite,
+            use_sparse_hessian=use_sparse_hessian,
             diag_fisher=diag_fisher,
         )
 

@@ -69,6 +69,7 @@ def _cd_solver(state, data, active_hessian):
         else:
             # with no intercept, active_set here can be massive on first iteration
             X_active = data.X[:, state.active_set]
+
         (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher(
             X_active,
             state.hessian_rows,
@@ -103,6 +104,7 @@ def _cd_solver(state, data, active_hessian):
             data.has_upper_bounds,
             data._upper_bounds,
         )
+
     return new_coef - state.coef, n_cycles
 
 
@@ -184,10 +186,10 @@ def update_hessian(state, data, active_set):
         )
 
         # In the sparse Hessian case, state.hessian is a coo_matrix.
-        # hessian_init is a numpy array of size active_set x active_set;
+        # hessian_init is np array of size active_set.shape[0] x active_set.shape[0];
         # we can convert this to a coo_matrix and simply add it to the state.hessian
-        if state.use_sparse_hessian:
-            coo_data = hessian_init.flatten()
+        if data.use_sparse_hessian:
+            coo_data = hessian_init.ravel(order="C")
             coo_rows = np.repeat(active_set, active_set.shape[0])
             coo_cols = np.tile(active_set, active_set.shape[0])
 
@@ -225,8 +227,8 @@ def update_hessian(state, data, active_set):
             active_cols=active_set,
         )
 
-        if state.use_sparse_hessian:
-            coo_data = hessian_delta.flatten()
+        if data.use_sparse_hessian:
+            coo_data = hessian_delta.ravel(order="C")
             coo_rows = np.repeat(active_set, active_set.shape[0])
             coo_cols = np.tile(active_set, active_set.shape[0])
 
@@ -241,9 +243,9 @@ def update_hessian(state, data, active_set):
 
     # If state.hessian is in COO form, we have to convert to CSC in order to index the
     # active set elements, which we then convert to a numpy array for compatibility with
-    # the rest of the code
-    if state.use_sparse_hessian:
-        # this is a little awkward
+    # the rest of the code. This numpy array is dense, but we care about problems in
+    # which the active set is usually much smaller than the number of data columns.
+    if data.use_sparse_hessian:
         return (
             state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(),
             n_active_rows,
@@ -332,7 +334,6 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L
 
     Repeat steps 1-3 until convergence.
     Note: Use hessian matrix instead of Hessian for H.
-        This used to say "Use Fisher matrix instead of Hessian for H."
     Note: f' = -score, H = hessian matrix
 
     Parameters
@@ -508,6 +509,7 @@ def __init__(
         lower_bounds: Optional[np.ndarray] = None,
         upper_bounds: Optional[np.ndarray] = None,
         verbose: bool = False,
+        use_sparse_hessian: bool = True,
         diag_fisher: bool = False,
     ):
         self.X = X
@@ -540,6 +542,7 @@ def __init__(
 
         self.intercept_offset = 1 if self.fit_intercept else 0
         self.verbose = verbose
+        self.use_sparse_hessian = use_sparse_hessian
         self.diag_fisher = diag_fisher
 
         self._check_data()
@@ -606,8 +609,7 @@ def __init__(self, coef, data):
 
         # In order to avoid memory issues for very wide datasets (e.g. Issue #485), we
         # can instantiate the Hessian as a sparse COO matrix.
-        self.use_sparse_hessian = True
-        if self.use_sparse_hessian:
+        if data.use_sparse_hessian:
             self.hessian = sparse.coo_matrix(
                 (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
             )