Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add use_sparse_hessian functionality to prevent a dense Hessian from being instantiated #543

Closed
wants to merge 9 commits into from
Prev Previous commit
Next Next commit
Address comments; about to split into two PRs
  • Loading branch information
AlanTuQC committed May 19, 2022
commit c5f566774bd53bf05f229c957a0745b409039dda
14 changes: 5 additions & 9 deletions src/glum/_cd_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,6 @@ def enet_coordinate_descent_gram(int[::1] active_set,
"""

cdef unsigned int n_active_features = active_set.shape[0]
cdef unsigned int n_features = Q.shape[0] # are you sure this is correct?
# n_features is exactly now the same as n_active_features; should it be q.shape[0]?
# however, the variable is never used again, so I guess we are okay

cdef floating w_ii
cdef floating P1_ii
Expand All @@ -141,7 +138,7 @@ def enet_coordinate_descent_gram(int[::1] active_set,
for n_iter in range(max_iter):
w_max = 0.0
d_w_max = 0.0
for f_iter in range(n_active_features):
for f_iter in range(n_active_features): # Loop over coordinates
if random:
active_set_ii = rand_int(n_active_features, rand_r_state)
else:
Expand Down Expand Up @@ -227,8 +224,8 @@ def enet_coordinate_descent_gram_diag_fisher(
q = X^T y

This version, for diag_fisher=True, never calculates the entire Hessian matrix;
only rows, when necessary. The inefficiency here is that we must re-calculate
the rows of Q for every iteration up to max_iter.
only rows, when necessary. This requires less memory. However, the inefficiency
is that we must re-calculate the rows of Q for every iteration up to max_iter.
"""

cdef unsigned int n_active_features = active_set.shape[0]
Expand Down Expand Up @@ -260,14 +257,13 @@ def enet_coordinate_descent_gram_diag_fisher(
for n_iter in range(max_iter):
w_max = 0.0
d_w_max = 0.0
for f_iter in range(n_active_features):
for f_iter in range(n_active_features): # Loop over coordinates
if random:
active_set_ii = rand_int(n_active_features, rand_r_state)
else:
active_set_ii = f_iter
ii = active_set[active_set_ii]

# Check logic here
if active_set[0] < <unsigned int>intercept:
if ii == 0:
Q_active_set_ii[0] = hessian_rows.sum()
Expand Down Expand Up @@ -335,7 +331,7 @@ def enet_coordinate_descent_gram_diag_fisher(
"subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol),
ConvergenceWarning)

return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check
return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1


cdef void cython_norm_min_subgrad(
Expand Down
17 changes: 14 additions & 3 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,8 @@ def __init__(
A_ineq: Optional[np.ndarray] = None,
b_ineq: Optional[np.ndarray] = None,
force_all_finite: bool = True,
diag_fisher=False,
use_sparse_hessian: bool = True,
diag_fisher: bool = False,
):
self.l1_ratio = l1_ratio
self.P1 = P1
Expand Down Expand Up @@ -702,6 +703,7 @@ def __init__(
self.A_ineq = A_ineq
self.b_ineq = b_ineq
self.force_all_finite = force_all_finite
self.use_sparse_hessian = use_sparse_hessian
self.diag_fisher = diag_fisher

@property
Expand Down Expand Up @@ -971,6 +973,7 @@ def _solve(
lower_bounds=lower_bounds,
upper_bounds=upper_bounds,
verbose=self.verbose > 0,
use_sparse_hessian=self.use_sparse_hessian,
diag_fisher=self.diag_fisher,
)
if self._solver == "irls-ls":
Expand All @@ -979,7 +982,7 @@ def _solve(
)
# 4.2 coordinate descent ##############################################
elif self._solver == "irls-cd":
# [Alan] This is the case we're concerned with for wide problems.
# This is the case we're concerned with for wide problems.
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_cd_solver, coef, irls_data
)
Expand Down Expand Up @@ -2092,6 +2095,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for
details.

use_sparse_hessian : boolean, optional, (default=True)
If ``True``, stores the current state of the Hessian as a sparse COO matrix.
If ``False``, stores as a dense matrix of size `num_cols` x `num_cols`, where
`num_cols` is the number of columns in the data X. Use ``True`` to avoid memory
issues when working with extremely wide data.

diag_fisher : boolean, optional, (default=False)
Only relevant for solver 'cd' (see also ``start_params='guess'``).
If ``False``, the full Fisher matrix (expected Hessian) is computed in
Expand Down Expand Up @@ -2183,7 +2192,8 @@ def __init__(
A_ineq: Optional[np.ndarray] = None,
b_ineq: Optional[np.ndarray] = None,
force_all_finite: bool = True,
diag_fisher=False,
use_sparse_hessian: bool = True,
diag_fisher: bool = False,
):
self.alphas = alphas
self.alpha = alpha
Expand Down Expand Up @@ -2216,6 +2226,7 @@ def __init__(
A_ineq=A_ineq,
b_ineq=b_ineq,
force_all_finite=force_all_finite,
use_sparse_hessian=use_sparse_hessian,
diag_fisher=diag_fisher,
)

Expand Down
24 changes: 13 additions & 11 deletions src/glum/_solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def _cd_solver(state, data, active_hessian):
else:
# with no intercept, active_set here can be massive on first iteration
X_active = data.X[:, state.active_set]

(new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher(
X_active,
state.hessian_rows,
Expand Down Expand Up @@ -103,6 +104,7 @@ def _cd_solver(state, data, active_hessian):
data.has_upper_bounds,
data._upper_bounds,
)

return new_coef - state.coef, n_cycles


Expand Down Expand Up @@ -184,10 +186,10 @@ def update_hessian(state, data, active_set):
)

# In the sparse Hessian case, state.hessian is a coo_matrix.
# hessian_init is a numpy array of size active_set x active_set;
# hessian_init is np array of size active_set.shape[0] x active_set.shape[0];
# we can convert this to a coo_matrix and simply add it to the state.hessian
if state.use_sparse_hessian:
coo_data = hessian_init.flatten()
if data.use_sparse_hessian:
coo_data = hessian_init.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

Expand Down Expand Up @@ -225,8 +227,8 @@ def update_hessian(state, data, active_set):
active_cols=active_set,
)

if state.use_sparse_hessian:
coo_data = hessian_delta.flatten()
if data.use_sparse_hessian:
coo_data = hessian_delta.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

Expand All @@ -241,9 +243,9 @@ def update_hessian(state, data, active_set):

# If state.hessian is in COO form, we have to convert to CSC in order to index the
# active set elements, which we then convert to a numpy array for compatibility with
# the rest of the code
if state.use_sparse_hessian:
# this is a little awkward
# the rest of the code. This numpy array is dense, but we care about problems in
# which the active set is usually much smaller than the number of data columns.
if data.use_sparse_hessian:
return (
state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(),
n_active_rows,
Expand Down Expand Up @@ -332,7 +334,6 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L

Repeat steps 1-3 until convergence.
Note: Use hessian matrix instead of Hessian for H.
This used to say "Use Fisher matrix instead of Hessian for H."
Note: f' = -score, H = hessian matrix

Parameters
Expand Down Expand Up @@ -508,6 +509,7 @@ def __init__(
lower_bounds: Optional[np.ndarray] = None,
upper_bounds: Optional[np.ndarray] = None,
verbose: bool = False,
use_sparse_hessian: bool = True,
diag_fisher: bool = False,
):
self.X = X
Expand Down Expand Up @@ -540,6 +542,7 @@ def __init__(

self.intercept_offset = 1 if self.fit_intercept else 0
self.verbose = verbose
self.use_sparse_hessian = use_sparse_hessian
self.diag_fisher = diag_fisher

self._check_data()
Expand Down Expand Up @@ -606,8 +609,7 @@ def __init__(self, coef, data):

# In order to avoid memory issues for very wide datasets (e.g. Issue #485), we
# can instantiate the Hessian as a sparse COO matrix.
self.use_sparse_hessian = True
if self.use_sparse_hessian:
if data.use_sparse_hessian:
self.hessian = sparse.coo_matrix(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)
Expand Down