From 0868b0f17c6cb593de3d1c4fb59b9b6785601015 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Wed, 12 Oct 2022 20:42:31 +0200 Subject: [PATCH 01/29] POC FISTA --- skglm/solvers/__init__.py | 3 +- skglm/solvers/fista.py | 61 +++++++++++++++++++++++++++++++++++++++ toy_fista.py | 27 +++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 skglm/solvers/fista.py create mode 100644 toy_fista.py diff --git a/skglm/solvers/__init__.py b/skglm/solvers/__init__.py index 0f8016f40..d86eb275a 100644 --- a/skglm/solvers/__init__.py +++ b/skglm/solvers/__init__.py @@ -1,9 +1,10 @@ from .anderson_cd import AndersonCD from .base import BaseSolver +from .fista import FISTA from .gram_cd import GramCD from .group_bcd import GroupBCD from .multitask_bcd import MultiTaskBCD from .prox_newton import ProxNewton -__all__ = [AndersonCD, BaseSolver, GramCD, GroupBCD, MultiTaskBCD, ProxNewton] +__all__ = [AndersonCD, BaseSolver, FISTA, GramCD, GroupBCD, MultiTaskBCD, ProxNewton] diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py new file mode 100644 index 000000000..3aff84d0f --- /dev/null +++ b/skglm/solvers/fista.py @@ -0,0 +1,61 @@ +import numpy as np +from numba import njit +from skglm.solvers.base import BaseSolver + + +class FISTA(BaseSolver): + r"""ISTA solver with Nesterov acceleration (FISTA).""" + + def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, + opt_freq=50, verbose=0): + self.max_iter = max_iter + self.tol = tol + self.fit_intercept = fit_intercept + self.warm_start = warm_start + self.opt_freq = opt_freq + self.verbose = verbose + + def solve(self, X, y, penalty, w_init=None, weights=None): + # needs a quadratic datafit, but works with L1, WeightedL1, SLOPE + n_samples, n_features = X.shape + all_features = np.arange(n_features) + t_new = 1 + + w = w_init.copy() if w_init is not None else np.zeros(n_features) + z = w_init.copy() if w_init is not None else np.zeros(n_features) + weights = weights if weights is not None else np.ones(n_features) + + # FISTA with Gram update + G = X.T @ X + Xty = X.T @ y + lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples + + for n_iter in range(self.max_iter): + t_old = t_new + t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 + w_old = w.copy() + grad = (G @ z - Xty) / n_samples + z -= grad / lipschitz + # TODO: TO DISCUSS! + # XXX: should add a full prox update + for j in range(n_features): + w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + z = w + (t_old - 1.) 
/ t_new * (w - w_old) + + if n_iter % self.opt_freq == 0: + opt = penalty.subdiff_distance(w, grad, all_features) + stop_crit = np.max(opt) + + if self.verbose: + p_obj = (np.sum((y - X @ w) ** 2) / (2 * n_samples) + + penalty.alpha * penalty.value(w)) + print( + f"Iteration {n_iter+1}: {p_obj:.10f}, " + f"stopping crit: {stop_crit:.2e}" + ) + + if stop_crit < self.tol: + if self.verbose: + print(f"Stopping criterion max violation: {stop_crit:.2e}") + break + return w diff --git a/toy_fista.py b/toy_fista.py new file mode 100644 index 000000000..fb26f99be --- /dev/null +++ b/toy_fista.py @@ -0,0 +1,27 @@ +import numpy as np +from numpy.linalg import norm +from skglm.solvers import FISTA +from skglm.penalties import L1 +from skglm.estimators import Lasso +from skglm.utils import make_correlated_data, compiled_clone + + +X, y, _ = make_correlated_data(n_samples=200, n_features=100, random_state=24) + +n_samples, n_features = X.shape +alpha_max = norm(X.T @ y, ord=np.inf) / n_samples + +alpha = alpha_max / 10 + +max_iter = 1000 +obj_freq = 100 +tol = 1e-10 + +solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) +penalty = compiled_clone(L1(alpha)) +w = solver.solve(X, y, penalty) + +clf = Lasso(alpha=alpha, tol=tol, fit_intercept=False) +clf.fit(X, y) + +np.testing.assert_allclose(w, clf.coef_, rtol=1e-5) From 8584299888f3f7fca22156503367c849d67ca8f7 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 14:50:17 +0200 Subject: [PATCH 02/29] CLN --- skglm/solvers/fista.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 3aff84d0f..30e449721 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -3,16 +3,25 @@ from skglm.solvers.base import BaseSolver +@njit +def _prox_vec(w, z, penalty, lipschitz): + # XXX: TO DISCUSS: should add a vectorized prox update + n_features = w.shape[0] + for j in range(n_features): + w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + return w + + class FISTA(BaseSolver): r"""ISTA solver with Nesterov acceleration (FISTA).""" def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, - opt_freq=50, verbose=0): + opt_freq=100, verbose=0): self.max_iter = max_iter self.tol = tol self.fit_intercept = fit_intercept self.warm_start = warm_start - self.opt_freq = opt_freq + self.opt_freq = opt_freq self.verbose = verbose def solve(self, X, y, penalty, w_init=None, weights=None): @@ -36,10 +45,7 @@ def solve(self, X, y, penalty, w_init=None, weights=None): w_old = w.copy() grad = (G @ z - Xty) / n_samples z -= grad / lipschitz - # TODO: TO DISCUSS! - # XXX: should add a full prox update - for j in range(n_features): - w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + w = _prox_vec(w, z, penalty, lipschitz) z = w + (t_old - 1.) 
/ t_new * (w - w_old) if n_iter % self.opt_freq == 0: @@ -48,7 +54,7 @@ def solve(self, X, y, penalty, w_init=None, weights=None): if self.verbose: p_obj = (np.sum((y - X @ w) ** 2) / (2 * n_samples) - + penalty.alpha * penalty.value(w)) + + penalty.value(w)) print( f"Iteration {n_iter+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" From c82e32eb1b5af1d389d68d3dfad3a38cd8169378 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 15:09:26 +0200 Subject: [PATCH 03/29] changed obj_freq from 100 to 10 --- skglm/solvers/fista.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 30e449721..226ed4f89 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -5,7 +5,6 @@ @njit def _prox_vec(w, z, penalty, lipschitz): - # XXX: TO DISCUSS: should add a vectorized prox update n_features = w.shape[0] for j in range(n_features): w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) @@ -16,7 +15,7 @@ class FISTA(BaseSolver): r"""ISTA solver with Nesterov acceleration (FISTA).""" def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, - opt_freq=100, verbose=0): + opt_freq=10, verbose=0): self.max_iter = max_iter self.tol = tol self.fit_intercept = fit_intercept From 4940a0d2cff4208b73211b20f4c39b670b8a159a Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 15:48:35 +0200 Subject: [PATCH 04/29] WIP Lipschitz --- skglm/solvers/fista.py | 17 ++++++++--------- toy_fista.py | 5 ++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 226ed4f89..336e65450 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -1,6 +1,7 @@ import numpy as np from numba import njit from skglm.solvers.base import BaseSolver +from skglm.solvers.common import construct_grad @njit @@ -23,28 +24,27 @@ def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False self.opt_freq = opt_freq self.verbose = verbose - def solve(self, X, y, penalty, w_init=None, weights=None): - # needs a quadratic datafit, but works with L1, WeightedL1, SLOPE + def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): n_samples, n_features = X.shape all_features = np.arange(n_features) t_new = 1 w = w_init.copy() if w_init is not None else np.zeros(n_features) z = w_init.copy() if w_init is not None else np.zeros(n_features) - weights = weights if weights is not None else np.ones(n_features) + Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - # FISTA with Gram update - G = X.T @ X - Xty = X.T @ y + # line search? + # lipschitz = np.max(datafit.lipschitz) lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples for n_iter in range(self.max_iter): t_old = t_new t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 w_old = w.copy() - grad = (G @ z - Xty) / n_samples + grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz w = _prox_vec(w, z, penalty, lipschitz) + Xw = X @ w z = w + (t_old - 1.) 
/ t_new * (w - w_old) if n_iter % self.opt_freq == 0: @@ -52,8 +52,7 @@ def solve(self, X, y, penalty, w_init=None, weights=None): stop_crit = np.max(opt) if self.verbose: - p_obj = (np.sum((y - X @ w) ** 2) / (2 * n_samples) - + penalty.value(w)) + p_obj = datafit.value(y, w, Xw) + penalty.value(w) print( f"Iteration {n_iter+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" diff --git a/toy_fista.py b/toy_fista.py index fb26f99be..476ed578d 100644 --- a/toy_fista.py +++ b/toy_fista.py @@ -1,5 +1,6 @@ import numpy as np from numpy.linalg import norm +from skglm.datafits.single_task import Quadratic from skglm.solvers import FISTA from skglm.penalties import L1 from skglm.estimators import Lasso @@ -18,8 +19,10 @@ tol = 1e-10 solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) +datafit = compiled_clone(Quadratic()) +datafit.initialize(X, y) penalty = compiled_clone(L1(alpha)) -w = solver.solve(X, y, penalty) +w = solver.solve(X, y, datafit, penalty) clf = Lasso(alpha=alpha, tol=tol, fit_intercept=False) clf.fit(X, y) From e47c68a91532be49aa7f975cb782f5b05dcae894 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 17:03:49 +0200 Subject: [PATCH 05/29] ADD global lipschitz constants --- skglm/datafits/single_task.py | 45 +++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 1f4d39503..4aefaac0a 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -22,6 +22,10 @@ class Quadratic(BaseDatafit): The coordinatewise gradient Lipschitz constants. Equal to norm(X, axis=0) ** 2 / n_samples. + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(X, ord=2) ** 2 / n_samples. + Note ---- The class is jit compiled at fit time using Numba compiler. @@ -35,6 +39,7 @@ def get_spec(self): spec = ( ('Xty', float64[:]), ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -44,6 +49,7 @@ def params_to_dict(self): def initialize(self, X, y): self.Xty = X.T @ y n_features = X.shape[1] + self.global_lipschitz = norm(X, ord=2) ** 2 / len(y) self.lipschitz = np.zeros(n_features, dtype=X.dtype) for j in range(n_features): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) @@ -53,15 +59,19 @@ def initialize_sparse( n_features = len(X_indptr) - 1 self.Xty = np.zeros(n_features, dtype=X_data.dtype) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. xty = 0 + x2 = 0. for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 xty += X_data[idx] * y[X_indices[idx]] + x2 += X_data[idx] ** 2 / len(y) self.lipschitz[j] = nrm2 / len(y) self.Xty[j] = xty + self.global_lipschitz += x2 def value(self, y, w, Xw): return np.sum((y - Xw) ** 2) / (2 * len(Xw)) @@ -111,6 +121,10 @@ class Logistic(BaseDatafit): The coordinatewise gradient Lipschitz constants. Equal to norm(X, axis=0) ** 2 / (4 * n_samples). + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(X, ord=2) ** 2 / (4 * n_samples). + Note ---- The class is jit compiled at fit time using Numba compiler. 
@@ -123,6 +137,7 @@ def __init__(self): def get_spec(self): spec = ( ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -140,13 +155,16 @@ def raw_hessian(self, y, Xw): def initialize(self, X, y): self.lipschitz = (X ** 2).sum(axis=0) / (len(y) * 4) + self.global_lipschitz = norm(X, ord=2) ** 2 / (len(y) * 4) def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): Xj = X_data[X_indptr[j]:X_indptr[j+1]] self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4) + self.global_lipschitz += (Xj ** 2).sum() / (len(y) * 4) def value(self, y, w, Xw): return np.log(1. + np.exp(- y * Xw)).sum() / len(y) @@ -187,6 +205,11 @@ class QuadraticSVC(BaseDatafit): ---------- lipschitz : array, shape (n_features,) The coordinatewise gradient Lipschitz constants. + Equal to norm(yXT, axis=0) ** 2. + + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(yXT, ord=2) ** 2. Note ---- @@ -200,6 +223,7 @@ def __init__(self): def get_spec(self): spec = ( ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -209,18 +233,22 @@ def params_to_dict(self): def initialize(self, yXT, y): n_features = yXT.shape[1] self.lipschitz = np.zeros(n_features, dtype=yXT.dtype) + self.global_lipschitz = 0. for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 + self.global_lipschitz += norm(yXT[:, j]) ** 2 def initialize_sparse( self, yXT_data, yXT_indptr, yXT_indices, y): n_features = len(yXT_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. for idx in range(yXT_indptr[j], yXT_indptr[j + 1]): nrm2 += yXT_data[idx] ** 2 self.lipschitz[j] = nrm2 + self.global_lipschitz += nrm2 def value(self, y, w, yXTw): return (yXTw ** 2).sum() / 2 - np.sum(w) @@ -264,8 +292,16 @@ class Huber(BaseDatafit): Attributes ---------- + delta : float + Shape hyperparameter. + lipschitz : array, shape (n_features,) - The coordinatewise gradient Lipschitz constants. + The coordinatewise gradient Lipschitz constants. Equal to + norm(X, axis=0) ** 2 / n_samples. + + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(X, ord=2) ** 2 / n_samples. Note ---- @@ -279,7 +315,8 @@ def __init__(self, delta): def get_spec(self): spec = ( ('delta', float64), - ('lipschitz', float64[:]) + ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -289,18 +326,22 @@ def params_to_dict(self): def initialize(self, X, y): n_features = X.shape[1] self.lipschitz = np.zeros(n_features, dtype=X.dtype) + self.global_lipschitz = 0. for j in range(n_features): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) + self.global_lipschitz += (X[:, j] ** 2).sum() / len(y) def initialize_sparse( self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. 
for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 self.lipschitz[j] = nrm2 / len(y) + self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): n_samples = len(y) From 3635f241c31ac851da09d9c04cfbb17f9b7051c7 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 17:10:49 +0200 Subject: [PATCH 06/29] FISTA with global lipschitz --- skglm/solvers/fista.py | 5 ++--- toy_fista.py | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 336e65450..8b9cebeeb 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -33,9 +33,8 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): z = w_init.copy() if w_init is not None else np.zeros(n_features) Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - # line search? - # lipschitz = np.max(datafit.lipschitz) - lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples + # TODO: OR line search + lipschitz = datafit.global_lipschitz for n_iter in range(self.max_iter): t_old = t_new diff --git a/toy_fista.py b/toy_fista.py index 476ed578d..1c29bf713 100644 --- a/toy_fista.py +++ b/toy_fista.py @@ -1,9 +1,9 @@ import numpy as np from numpy.linalg import norm -from skglm.datafits.single_task import Quadratic +from skglm.datafits.single_task import Quadratic, Logistic from skglm.solvers import FISTA from skglm.penalties import L1 -from skglm.estimators import Lasso +from skglm.estimators import SparseLogisticRegression, Lasso from skglm.utils import make_correlated_data, compiled_clone @@ -19,6 +19,10 @@ tol = 1e-10 solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) + +############## +# Quadratic +############## datafit = compiled_clone(Quadratic()) datafit.initialize(X, y) penalty = compiled_clone(L1(alpha)) @@ -28,3 +32,20 @@ clf.fit(X, y) np.testing.assert_allclose(w, clf.coef_, rtol=1e-5) + +############## +# Logistic +############## +y = np.sign(y) +alpha_max = norm(X.T @ y, ord=np.inf) / (4 * n_samples) +alpha = alpha_max / 10 + +datafit = compiled_clone(Logistic()) +datafit.initialize(X, y) +penalty = compiled_clone(L1(alpha)) +w = solver.solve(X, y, datafit, penalty) + +clf = SparseLogisticRegression(alpha=alpha, tol=tol, fit_intercept=False) +clf.fit(X, y) + +np.testing.assert_allclose(w, np.squeeze(clf.coef_), rtol=1e-3) From 488011236c5b411a57572cb2544e16a023b15454 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 21:25:15 +0200 Subject: [PATCH 07/29] writing tests --- skglm/datafits/single_task.py | 7 ++---- skglm/solvers/fista.py | 38 ++++++++++++++++++++++------ skglm/tests/test_fista.py | 47 +++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 12 deletions(-) create mode 100644 skglm/tests/test_fista.py diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 4aefaac0a..583312083 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -63,15 +63,13 @@ def initialize_sparse( for j in range(n_features): nrm2 = 0. xty = 0 - x2 = 0. 
for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 xty += X_data[idx] * y[X_indices[idx]] - x2 += X_data[idx] ** 2 / len(y) self.lipschitz[j] = nrm2 / len(y) self.Xty[j] = xty - self.global_lipschitz += x2 + self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): return np.sum((y - Xw) ** 2) / (2 * len(Xw)) @@ -233,10 +231,9 @@ def params_to_dict(self): def initialize(self, yXT, y): n_features = yXT.shape[1] self.lipschitz = np.zeros(n_features, dtype=yXT.dtype) - self.global_lipschitz = 0. + self.global_lipschitz = norm(yXT, ord=2) ** 2 / len(y) for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 - self.global_lipschitz += norm(yXT[:, j]) ** 2 def initialize_sparse( self, yXT_data, yXT_indptr, yXT_indices, y): diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 8b9cebeeb..aaafca104 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -13,14 +13,35 @@ def _prox_vec(w, z, penalty, lipschitz): class FISTA(BaseSolver): - r"""ISTA solver with Nesterov acceleration (FISTA).""" + r"""ISTA solver with Nesterov acceleration (FISTA). - def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, - opt_freq=10, verbose=0): + This solver implements accelerated proximal gradient descent for linear problems. + + Attributes + ---------- + max_iter : int, default 100 + Maximum number of iterations. + + tol : float, default 1e-4 + Tolerance for convergence. + + opt_freq : int, default 10 + Frequency for optimality condition check. + + verbose : bool, default False + Amount of verbosity. 0/False is silent. + + References + ---------- + .. [1] Beck, A. and Teboulle M. + "A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse + problems", 2009, SIAM J. Imaging Sci. 
+ https://epubs.siam.org/doi/10.1137/080716542 + """ + + def __init__(self, max_iter=100, tol=1e-4, opt_freq=10, verbose=0): self.max_iter = max_iter self.tol = tol - self.fit_intercept = fit_intercept - self.warm_start = warm_start self.opt_freq = opt_freq self.verbose = verbose @@ -33,8 +54,11 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): z = w_init.copy() if w_init is not None else np.zeros(n_features) Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - # TODO: OR line search - lipschitz = datafit.global_lipschitz + if hasattr(datafit, "global_lipschitz"): + lipschitz = datafit.global_lipschitz + else: + # TODO: OR line search + raise Exception("Line search is not yet implemented for FISTA solver.") for n_iter in range(self.max_iter): t_old = t_new diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py new file mode 100644 index 000000000..e921c146f --- /dev/null +++ b/skglm/tests/test_fista.py @@ -0,0 +1,47 @@ +import pytest + +import numpy as np +from numpy.linalg import norm + +from skglm.datafits import Quadratic, Logistic, QuadraticSVC +from skglm.estimators import Lasso, LinearSVC, SparseLogisticRegression +from skglm.penalties import L1, IndicatorBox +from skglm.solvers import FISTA +from skglm.utils import make_correlated_data, compiled_clone + + +n_samples, n_features = 10, 20 +X, y, _ = make_correlated_data( + n_samples=n_samples, n_features=n_features, random_state=0) +y_classif = np.sign(y) + +alpha_max = norm(X.T @ y, ord=np.inf) / len(y) +alpha = alpha_max / 100 + +tol = 1e-8 + +# TODO: use GeneralizedLinearEstimator (to test global lipschtiz constants of every datafit) +# TODO: test sparse matrices (global lipschitz constants) +@pytest.mark.parametrize("Datafit, Penalty, Estimator", [ + (Quadratic, L1, Lasso), + (Logistic, L1, SparseLogisticRegression), + (QuadraticSVC, IndicatorBox, LinearSVC), +]) +def test_fista_solver(Datafit, Penalty, Estimator): + _y = y if isinstance(Datafit, Quadratic) else y_classif + datafit = compiled_clone(Datafit()) + _init = y @ X.T if isinstance(Datafit, QuadraticSVC) else X + datafit.initialize(_init, _y) + penalty = compiled_clone(Penalty(alpha)) + + solver = FISTA(max_iter=1000, tol=tol) + w = solver.solve(X, _y, datafit, penalty) + + estimator = Estimator(alpha, tol=tol, fit_intercept=False) + estimator.fit(X, _y) + + np.testing.assert_allclose(w, estimator.coef_.flatten(), rtol=1e-3) + + +if __name__ == '__main__': + pass From 46a9a76fe3d88b1779729352013f5d5be053cf67 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:20:07 +0200 Subject: [PATCH 08/29] better tests --- skglm/tests/test_fista.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index e921c146f..04c1ed516 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -2,45 +2,50 @@ import numpy as np from numpy.linalg import norm +from scipy.sparse import csc_matrix, issparse from skglm.datafits import Quadratic, Logistic, QuadraticSVC -from skglm.estimators import Lasso, LinearSVC, SparseLogisticRegression from skglm.penalties import L1, IndicatorBox -from skglm.solvers import FISTA +from skglm.solvers import FISTA, AndersonCD from skglm.utils import make_correlated_data, compiled_clone -n_samples, n_features = 10, 20 +np.random.seed(0) +n_samples, n_features = 50, 60 X, y, _ = make_correlated_data( n_samples=n_samples, n_features=n_features, random_state=0) +X_sparse = 
csc_matrix(X * np.random.binomial(1, 0.1, X.shape)) y_classif = np.sign(y) alpha_max = norm(X.T @ y, ord=np.inf) / len(y) alpha = alpha_max / 100 -tol = 1e-8 +tol = 1e-10 -# TODO: use GeneralizedLinearEstimator (to test global lipschtiz constants of every datafit) -# TODO: test sparse matrices (global lipschitz constants) -@pytest.mark.parametrize("Datafit, Penalty, Estimator", [ - (Quadratic, L1, Lasso), - (Logistic, L1, SparseLogisticRegression), - (QuadraticSVC, IndicatorBox, LinearSVC), + +@pytest.mark.parametrize("X", [X, X_sparse]) +@pytest.mark.parametrize("Datafit, Penalty", [ + (Quadratic, L1), + (Logistic, L1), + (QuadraticSVC, IndicatorBox), ]) -def test_fista_solver(Datafit, Penalty, Estimator): +def test_fista_solver(X, Datafit, Penalty): _y = y if isinstance(Datafit, Quadratic) else y_classif datafit = compiled_clone(Datafit()) _init = y @ X.T if isinstance(Datafit, QuadraticSVC) else X - datafit.initialize(_init, _y) + if issparse(X): + datafit.initialize_sparse(_init.data, _init.indptr, _init.indices, _y) + else: + datafit.initialize(_init, _y) penalty = compiled_clone(Penalty(alpha)) - solver = FISTA(max_iter=1000, tol=tol) + solver = FISTA(max_iter=1000, tol=tol, opt_freq=1) w = solver.solve(X, _y, datafit, penalty) - estimator = Estimator(alpha, tol=tol, fit_intercept=False) - estimator.fit(X, _y) + solver_cd = AndersonCD(tol=tol, fit_intercept=False) + w_cd = solver_cd.solve(X, _y, datafit, penalty)[0] - np.testing.assert_allclose(w, estimator.coef_.flatten(), rtol=1e-3) + np.testing.assert_allclose(w, w_cd, rtol=1e-3) if __name__ == '__main__': From 9f0653a5165d278c84774ccc3d8cbe64b59cb4d7 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:20:42 +0200 Subject: [PATCH 09/29] support sparse matrices --- skglm/solvers/fista.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index aaafca104..b058033a5 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -1,7 +1,8 @@ import numpy as np +from scipy.sparse import issparse from numba import njit from skglm.solvers.base import BaseSolver -from skglm.solvers.common import construct_grad +from skglm.solvers.common import construct_grad, construct_grad_sparse @njit @@ -64,7 +65,11 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): t_old = t_new t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 w_old = w.copy() - grad = construct_grad(X, y, z, X @ z, datafit, all_features) + if issparse(X): + grad = construct_grad_sparse( + X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features) + else: + grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz w = _prox_vec(w, z, penalty, lipschitz) Xw = X @ w From fe159beed85fcdfb89364d23fd2bd47a455b5f63 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:20:50 +0200 Subject: [PATCH 10/29] fix mistake --- skglm/datafits/single_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 583312083..df6b860b2 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -231,7 +231,7 @@ def params_to_dict(self): def initialize(self, yXT, y): n_features = yXT.shape[1] self.lipschitz = np.zeros(n_features, dtype=yXT.dtype) - self.global_lipschitz = norm(yXT, ord=2) ** 2 / len(y) + self.global_lipschitz = norm(yXT, ord=2) ** 2 for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 From 
8e74e8a232894fb2f92cef3dc9582b557b05da85 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:32:35 +0200 Subject: [PATCH 11/29] RM toy_fista --- toy_fista.py | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) delete mode 100644 toy_fista.py diff --git a/toy_fista.py b/toy_fista.py deleted file mode 100644 index 1c29bf713..000000000 --- a/toy_fista.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -from numpy.linalg import norm -from skglm.datafits.single_task import Quadratic, Logistic -from skglm.solvers import FISTA -from skglm.penalties import L1 -from skglm.estimators import SparseLogisticRegression, Lasso -from skglm.utils import make_correlated_data, compiled_clone - - -X, y, _ = make_correlated_data(n_samples=200, n_features=100, random_state=24) - -n_samples, n_features = X.shape -alpha_max = norm(X.T @ y, ord=np.inf) / n_samples - -alpha = alpha_max / 10 - -max_iter = 1000 -obj_freq = 100 -tol = 1e-10 - -solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) - -############## -# Quadratic -############## -datafit = compiled_clone(Quadratic()) -datafit.initialize(X, y) -penalty = compiled_clone(L1(alpha)) -w = solver.solve(X, y, datafit, penalty) - -clf = Lasso(alpha=alpha, tol=tol, fit_intercept=False) -clf.fit(X, y) - -np.testing.assert_allclose(w, clf.coef_, rtol=1e-5) - -############## -# Logistic -############## -y = np.sign(y) -alpha_max = norm(X.T @ y, ord=np.inf) / (4 * n_samples) -alpha = alpha_max / 10 - -datafit = compiled_clone(Logistic()) -datafit.initialize(X, y) -penalty = compiled_clone(L1(alpha)) -w = solver.solve(X, y, datafit, penalty) - -clf = SparseLogisticRegression(alpha=alpha, tol=tol, fit_intercept=False) -clf.fit(X, y) - -np.testing.assert_allclose(w, np.squeeze(clf.coef_), rtol=1e-3) From a24ed9c6d5156bf02b4fee850cacd9484d11738a Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:37:24 +0200 Subject: [PATCH 12/29] green --- skglm/solvers/fista.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index b058033a5..dc2f60ccb 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -67,7 +67,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): w_old = w.copy() if issparse(X): grad = construct_grad_sparse( - X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features) + X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features) else: grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz From 4362c2c02c3d3c0303eb2b75d306afa253e7dba9 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:10:55 +0200 Subject: [PATCH 13/29] mv `_prox_vec` to utils --- skglm/solvers/fista.py | 12 ++---------- skglm/utils.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index dc2f60ccb..7214d0c2e 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -1,16 +1,8 @@ import numpy as np from scipy.sparse import issparse -from numba import njit from skglm.solvers.base import BaseSolver from skglm.solvers.common import construct_grad, construct_grad_sparse - - -@njit -def _prox_vec(w, z, penalty, lipschitz): - n_features = w.shape[0] - for j in range(n_features): - w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) - return w +from skglm.utils import prox_vec class FISTA(BaseSolver): @@ -71,7 +63,7 @@ def 
solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): else: grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz - w = _prox_vec(w, z, penalty, lipschitz) + w = prox_vec(w, z, penalty, lipschitz) Xw = X @ w z = w + (t_old - 1.) / t_new * (w - w_old) diff --git a/skglm/utils.py b/skglm/utils.py index 55c2fa462..bffa72c77 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -457,3 +457,32 @@ def extrapolate(self, w, Xw): C = inv_UTU_ones / np.sum(inv_UTU_ones) # floating point errors may cause w and Xw to disagree return self.arr_w_[:, 1:] @ C, self.arr_Xw_[:, 1:] @ C, True + + +@njit +def prox_vec(w, z, penalty, lipschitz): + """Evaluate the vectorized proximal operator for the FISTA solver. + + Parameters + ---------- + w : array, shape (n_features,) + Coefficient vector. + + z : array, shape (n_features,) + FISTA auxiliary variable. + + penalty : instance of Penalty. + Penalty object. + + lipschitz : float + Global Lipschitz constant. + + Returns + ------- + w : array; shape (n_features,) + Updated coefficient vector. + """ + n_features = w.shape[0] + for j in range(n_features): + w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + return w From 2665d5d1853cde35361e0176647b1c2f84f36412 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:11:58 +0200 Subject: [PATCH 14/29] rm `opt_freq` --- skglm/solvers/fista.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 7214d0c2e..4b1e709d8 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -18,9 +18,6 @@ class FISTA(BaseSolver): tol : float, default 1e-4 Tolerance for convergence. - opt_freq : int, default 10 - Frequency for optimality condition check. - verbose : bool, default False Amount of verbosity. 0/False is silent. @@ -32,10 +29,9 @@ class FISTA(BaseSolver): https://epubs.siam.org/doi/10.1137/080716542 """ - def __init__(self, max_iter=100, tol=1e-4, opt_freq=10, verbose=0): + def __init__(self, max_iter=100, tol=1e-4, verbose=0): self.max_iter = max_iter self.tol = tol - self.opt_freq = opt_freq self.verbose = verbose def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): @@ -67,19 +63,18 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): Xw = X @ w z = w + (t_old - 1.) 
/ t_new * (w - w_old) - if n_iter % self.opt_freq == 0: - opt = penalty.subdiff_distance(w, grad, all_features) - stop_crit = np.max(opt) + opt = penalty.subdiff_distance(w, grad, all_features) + stop_crit = np.max(opt) - if self.verbose: - p_obj = datafit.value(y, w, Xw) + penalty.value(w) - print( - f"Iteration {n_iter+1}: {p_obj:.10f}, " - f"stopping crit: {stop_crit:.2e}" - ) + if self.verbose: + p_obj = datafit.value(y, w, Xw) + penalty.value(w) + print( + f"Iteration {n_iter+1}: {p_obj:.10f}, " + f"stopping crit: {stop_crit:.2e}" + ) - if stop_crit < self.tol: - if self.verbose: - print(f"Stopping criterion max violation: {stop_crit:.2e}") - break + if stop_crit < self.tol: + if self.verbose: + print(f"Stopping criterion max violation: {stop_crit:.2e}") + break return w From 2e408bc6100ba993527bbc0bbbb234099dd3c70f Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:22:11 +0200 Subject: [PATCH 15/29] fix tests --- skglm/solvers/fista.py | 6 ++++-- skglm/tests/test_fista.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 4b1e709d8..0732d066f 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -35,6 +35,7 @@ def __init__(self, max_iter=100, tol=1e-4, verbose=0): self.verbose = verbose def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + p_objs_out = [] n_samples, n_features = X.shape all_features = np.arange(n_features) t_new = 1 @@ -66,8 +67,9 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): opt = penalty.subdiff_distance(w, grad, all_features) stop_crit = np.max(opt) + p_obj = datafit.value(y, w, Xw) + penalty.value(w) + p_objs_out.append(p_obj) if self.verbose: - p_obj = datafit.value(y, w, Xw) + penalty.value(w) print( f"Iteration {n_iter+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" @@ -77,4 +79,4 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): if self.verbose: print(f"Stopping criterion max violation: {stop_crit:.2e}") break - return w + return w, np.array(p_objs_out), stop_crit diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 04c1ed516..f490c11a4 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -39,13 +39,13 @@ def test_fista_solver(X, Datafit, Penalty): datafit.initialize(_init, _y) penalty = compiled_clone(Penalty(alpha)) - solver = FISTA(max_iter=1000, tol=tol, opt_freq=1) - w = solver.solve(X, _y, datafit, penalty) + solver = FISTA(max_iter=1000, tol=tol) + res_fista = solver.solve(X, _y, datafit, penalty) solver_cd = AndersonCD(tol=tol, fit_intercept=False) - w_cd = solver_cd.solve(X, _y, datafit, penalty)[0] + res_cd = solver_cd.solve(X, _y, datafit, penalty) - np.testing.assert_allclose(w, w_cd, rtol=1e-3) + np.testing.assert_allclose(res_fista[0], res_cd[0], rtol=1e-3) if __name__ == '__main__': From 8524cf7b0180b061d27e2522bd8ebb09d1fbae5a Mon Sep 17 00:00:00 2001 From: PAB Date: Sun, 16 Oct 2022 14:23:26 +0200 Subject: [PATCH 16/29] Update skglm/solvers/fista.py Co-authored-by: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> --- skglm/solvers/fista.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 0732d066f..1779dd3e3 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -38,7 +38,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): p_objs_out = [] n_samples, n_features = X.shape all_features = 
np.arange(n_features) - t_new = 1 + t_new = 1. w = w_init.copy() if w_init is not None else np.zeros(n_features) z = w_init.copy() if w_init is not None else np.zeros(n_features) From dd658f8954ff8edd7f1a8a818c9510b3beb4f13a Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:27:06 +0200 Subject: [PATCH 17/29] huber comment --- skglm/datafits/single_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index df6b860b2..3bf1eee0e 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -290,7 +290,7 @@ class Huber(BaseDatafit): Attributes ---------- delta : float - Shape hyperparameter. + Threshold hyperparameter. lipschitz : array, shape (n_features,) The coordinatewise gradient Lipschitz constants. Equal to From cbc54182d1215e848241780c7fbd21ed836f45d1 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 19:05:19 +0200 Subject: [PATCH 18/29] WIP --- skglm/datafits/single_task.py | 3 +-- skglm/utils.py | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 3bf1eee0e..0fd19dbe5 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -158,11 +158,10 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): Xj = X_data[X_indptr[j]:X_indptr[j+1]] self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4) - self.global_lipschitz += (Xj ** 2).sum() / (len(y) * 4) + self.global_lipschitz = 0. def value(self, y, w, Xw): return np.log(1. + np.exp(- y * Xw)).sum() / len(y) diff --git a/skglm/utils.py b/skglm/utils.py index bffa72c77..23215d244 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -486,3 +486,44 @@ def prox_vec(w, z, penalty, lipschitz): for j in range(n_features): w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) return w + + +@njit +def power_method(X_data, X_indptr, X_indices, n_iter): + """Power method to compute largest eigenvalue of sparse matrix X. + + Parameters + ---------- + X_data : array, shape (n_elements,) + `data` attribute of the sparse CSC matrix X. + + X_indptr : array, shape (n_features + 1,) + `indptr` attribute of the sparse CSC matrix X. + + X_indices : array, shape (n_elements,) + `indices` attribute of the sparse CSC matrix X. + + n_iter : int + Number of iterations for the power method. 
+ + Returns + ------- + rayleigh : float + Rayleigh quotient or spectral radius of X^TX + """ + n_features = len(X_indptr) - 1 + b_k = np.random.rand(n_features) + for _ in range(n_iter): + b_k1 = np.zeros(n_features) + for j in range(n_features): + b_k1[j] = + b_k1 = A @ b_k # write the full loop + b_k1_nrm = norm(b_k1) + b_k = b_k1 / b_k1_nrm + rayleigh = b_k1 @ b_k / (norm(b_k) ** 2) + return rayleigh + + + + + From e76dfb10f8f507ea7ed49ac3e9a3db926719d6d5 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 01:00:57 +0200 Subject: [PATCH 19/29] implement power method --- skglm/tests/test_fista.py | 12 +++- skglm/utils.py | 116 ++++++++++++++++++++++++++++++-------- 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index f490c11a4..617c71848 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -7,7 +7,7 @@ from skglm.datafits import Quadratic, Logistic, QuadraticSVC from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD -from skglm.utils import make_correlated_data, compiled_clone +from skglm.utils import make_correlated_data, compiled_clone, spectral_norm2 np.random.seed(0) @@ -39,7 +39,7 @@ def test_fista_solver(X, Datafit, Penalty): datafit.initialize(_init, _y) penalty = compiled_clone(Penalty(alpha)) - solver = FISTA(max_iter=1000, tol=tol) + solver = FISTA(max_iter=1000, tol=tol) res_fista = solver.solve(X, _y, datafit, penalty) solver_cd = AndersonCD(tol=tol, fit_intercept=False) @@ -48,5 +48,13 @@ def test_fista_solver(X, Datafit, Penalty): np.testing.assert_allclose(res_fista[0], res_cd[0], rtol=1e-3) +def test_spectral_norm2(): + X_bundles = (X_sparse.data, X_sparse.indptr, X_sparse.indices) + lipschitz_our = spectral_norm2(*X_bundles, n_samples=len(y)) + lipschitz_np = norm(X_sparse.toarray(), ord=2) ** 2 + + np.testing.assert_allclose(lipschitz_our, lipschitz_np, atol=1e-4, rtol=1e-5) + + if __name__ == '__main__': pass diff --git a/skglm/utils.py b/skglm/utils.py index 23215d244..8b791f033 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -488,42 +488,114 @@ def prox_vec(w, z, penalty, lipschitz): return w +POWER_MAX_ITER = 20 +TOL = 1e-6 ** 2 + + @njit -def power_method(X_data, X_indptr, X_indices, n_iter): - """Power method to compute largest eigenvalue of sparse matrix X. - +def spectral_norm2(X_data, X_indptr, X_indices, n_samples): + """Compute the squared spectral norm of sparse matrix ``X``. + + Find the largest eigenvalue of ``X @ X.T`` using the power method. + Parameters ---------- X_data : array, shape (n_elements,) - `data` attribute of the sparse CSC matrix X. + `data` attribute of the sparse CSC matrix ``X``. X_indptr : array, shape (n_features + 1,) - `indptr` attribute of the sparse CSC matrix X. + `indptr` attribute of the sparse CSC matrix ``X``. X_indices : array, shape (n_elements,) - `indices` attribute of the sparse CSC matrix X. - - n_iter : int - Number of iterations for the power method. - + `indices` attribute of the sparse CSC matrix ``X``. + + n_samples : int + number of rows of ``X``. + Returns ------- - rayleigh : float - Rayleigh quotient or spectral radius of X^TX + eigenvalue : float + The largest eigenvalue value of ``X.T @ X``, aka the squared spectral of ``X``. + + References + ---------- + .. [1] Alfio Quarteroni, Riccardo Sacco, Fausto Saleri "Numerical Mathematics", + chapiter 5, page 192-195. """ + # init vec with norm(vec) == 1. 
+ eigenvector = np.full(n_samples, 1/np.sqrt(n_samples)) + eigenvalue = 1. + + for _ in range(POWER_MAX_ITER): + vec = _XXT_dot_vec(X_data, X_indptr, X_indices, eigenvector, n_samples) + norm_vec = norm(vec) + eigenvalue = vec @ eigenvector + + # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) <= tol + if norm_vec ** 2 - eigenvalue ** 2 <= TOL: + break + + eigenvector = vec / norm_vec + + return eigenvalue + + +@njit +def _XXT_dot_vec(X_data, X_indptr, X_indices, vec, n_samples): + # computes X @ X.T @ vec, with X csc encoded + return _X_dot_vec(X_data, X_indptr, X_indices, + _XT_dot_vec(X_data, X_indptr, X_indices, vec), + n_samples) + + +@njit +def _X_dot_vec(X_data, X_indptr, X_indices, vec, n_samples): + # compute X @ vec, with X csc encoded + result = np.zeros(n_samples) + + # loop over features + for j in range(len(X_indptr) - 1): + if vec[j] == 0: + continue + + col_j_rows_idx = slice(X_indptr[j], X_indptr[j+1]) + result[X_indices[col_j_rows_idx]] += vec[j] * X_data[col_j_rows_idx] + + return result + + +@njit +def _XT_dot_vec(X_data, X_indptr, X_indices, vec): + # compute X.T @ vec, with X csc encoded n_features = len(X_indptr) - 1 - b_k = np.random.rand(n_features) - for _ in range(n_iter): - b_k1 = np.zeros(n_features) - for j in range(n_features): - b_k1[j] = - b_k1 = A @ b_k # write the full loop - b_k1_nrm = norm(b_k1) - b_k = b_k1 / b_k1_nrm - rayleigh = b_k1 @ b_k / (norm(b_k) ** 2) - return rayleigh + result = np.zeros(n_features) + + for j in range(n_features): + for idx in range(X_indptr[j], X_indptr[j+1]): + result[j] += X_data[idx] * vec[X_indices[idx]] + + return result + +if __name__ == '__main__': + from scipy.sparse import csc_matrix, random + import time + n_samples, n_features = 500, 600 + A = random(n_samples, n_features, density=0.5, format='csc') + X = csc_matrix(A) + X_dense = X.toarray() + # cache numba compilation + M = random(5, 7, density=0.9, format='csc') + spectral_norm2(M.data, M.indptr, M.indices, 5) + start = time.time() + spectral_norm2(X.data, X.indptr, X.indices, n_samples) + end = time.time() + print("our: ", end - start) + start = time.time() + norm(X_dense, ord=2) ** 2 + end = time.time() + print("np: ", end - start) From 2a4bce31274063476b6fe0aa313e39265cd07c3c Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 01:07:07 +0200 Subject: [PATCH 20/29] private ``prox_vec`` --- skglm/solvers/fista.py | 4 ++-- skglm/utils.py | 25 +++---------------------- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 1779dd3e3..417af1a7e 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -2,7 +2,7 @@ from scipy.sparse import issparse from skglm.solvers.base import BaseSolver from skglm.solvers.common import construct_grad, construct_grad_sparse -from skglm.utils import prox_vec +from skglm.utils import _prox_vec class FISTA(BaseSolver): @@ -60,7 +60,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): else: grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz - w = prox_vec(w, z, penalty, lipschitz) + w = _prox_vec(w, z, penalty, lipschitz) Xw = X @ w z = w + (t_old - 1.) / t_new * (w - w_old) diff --git a/skglm/utils.py b/skglm/utils.py index 8b791f033..230719a0c 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -460,28 +460,9 @@ def extrapolate(self, w, Xw): @njit -def prox_vec(w, z, penalty, lipschitz): - """Evaluate the vectorized proximal operator for the FISTA solver. 
- - Parameters - ---------- - w : array, shape (n_features,) - Coefficient vector. - - z : array, shape (n_features,) - FISTA auxiliary variable. - - penalty : instance of Penalty. - Penalty object. - - lipschitz : float - Global Lipschitz constant. - - Returns - ------- - w : array; shape (n_features,) - Updated coefficient vector. - """ +def _prox_vec(w, z, penalty, lipschitz): + # evaluate the vectorized proximal operator for the FISTA solver + # lipschitz stands for global lipschitz constant n_features = w.shape[0] for j in range(n_features): w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) From cd39a62290aa3c62a4d68a5c98d2811125c7d74d Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 10:22:33 +0200 Subject: [PATCH 21/29] random init in power method && default args --- skglm/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/skglm/utils.py b/skglm/utils.py index 230719a0c..75c993c57 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -469,12 +469,9 @@ def _prox_vec(w, z, penalty, lipschitz): return w -POWER_MAX_ITER = 20 -TOL = 1e-6 ** 2 - - @njit -def spectral_norm2(X_data, X_indptr, X_indices, n_samples): +def spectral_norm2(X_data, X_indptr, X_indices, n_samples, + max_iter=20, tol=1e-6): """Compute the squared spectral norm of sparse matrix ``X``. Find the largest eigenvalue of ``X @ X.T`` using the power method. @@ -503,17 +500,22 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples): .. [1] Alfio Quarteroni, Riccardo Sacco, Fausto Saleri "Numerical Mathematics", chapiter 5, page 192-195. """ + # tol is squared as we evaluate the square of inequality (5.25) in ref [1] + tol = tol ** 2 + # init vec with norm(vec) == 1. - eigenvector = np.full(n_samples, 1/np.sqrt(n_samples)) + eigenvector = np.random.randn(n_samples) + eigenvector /= norm(eigenvector) eigenvalue = 1. - for _ in range(POWER_MAX_ITER): + for _ in range(max_iter): vec = _XXT_dot_vec(X_data, X_indptr, X_indices, eigenvector, n_samples) norm_vec = norm(vec) eigenvalue = vec @ eigenvector # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) <= tol - if norm_vec ** 2 - eigenvalue ** 2 <= TOL: + # inequality is squared + if norm_vec ** 2 - eigenvalue ** 2 <= tol: break eigenvector = vec / norm_vec From 0e4d42a49c82341a4a864742d3998c02ae07fc3e Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 11:45:56 +0200 Subject: [PATCH 22/29] use power method for ``global_lipschitz`` --- skglm/datafits/single_task.py | 33 ++++++++++++++++++++------------- skglm/utils.py | 2 +- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 0fd19dbe5..c2bd741ff 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -4,6 +4,7 @@ from numba import float64 from skglm.datafits.base import BaseDatafit +from skglm.utils import spectral_norm2 class Quadratic(BaseDatafit): @@ -54,12 +55,14 @@ def initialize(self, X, y): for j in range(n_features): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) - def initialize_sparse( - self, X_data, X_indptr, X_indices, y): + def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.Xty = np.zeros(n_features, dtype=X_data.dtype) + + self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz /= len(y) + self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. 
xty = 0 @@ -69,7 +72,6 @@ def initialize_sparse( self.lipschitz[j] = nrm2 / len(y) self.Xty[j] = xty - self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): return np.sum((y - Xw) ** 2) / (2 * len(Xw)) @@ -157,11 +159,14 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 + + self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz /= (len(y) * 4) + self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) for j in range(n_features): Xj = X_data[X_indptr[j]:X_indptr[j+1]] self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4) - self.global_lipschitz = 0. def value(self, y, w, Xw): return np.log(1. + np.exp(- y * Xw)).sum() / len(y) @@ -234,17 +239,18 @@ def initialize(self, yXT, y): for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 - def initialize_sparse( - self, yXT_data, yXT_indptr, yXT_indices, y): + def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y): n_features = len(yXT_indptr) - 1 + + self.global_lipschitz = spectral_norm2( + yXT_data, yXT_indptr, yXT_indices, len(y)) + self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. for idx in range(yXT_indptr[j], yXT_indptr[j + 1]): nrm2 += yXT_data[idx] ** 2 self.lipschitz[j] = nrm2 - self.global_lipschitz += nrm2 def value(self, y, w, yXTw): return (yXTw ** 2).sum() / 2 - np.sum(w) @@ -327,17 +333,18 @@ def initialize(self, X, y): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) self.global_lipschitz += (X[:, j] ** 2).sum() / len(y) - def initialize_sparse( - self, X_data, X_indptr, X_indices, y): + def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 + + self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz /= len(y) + self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 self.lipschitz[j] = nrm2 / len(y) - self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): n_samples = len(y) diff --git a/skglm/utils.py b/skglm/utils.py index 75c993c57..4f83c8788 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -493,7 +493,7 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples, Returns ------- eigenvalue : float - The largest eigenvalue value of ``X.T @ X``, aka the squared spectral of ``X``. + The largest eigenvalue of ``X.T @ X``, aka the squared spectral norm of ``X``. 
References ---------- From 2bbc8f5394bccae0ce896d2a3dd8870409a337a1 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 11:46:13 +0200 Subject: [PATCH 23/29] fix && refactor unittest --- skglm/tests/test_fista.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 617c71848..528fe690a 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -2,23 +2,27 @@ import numpy as np from numpy.linalg import norm + +import scipy.sparse from scipy.sparse import csc_matrix, issparse -from skglm.datafits import Quadratic, Logistic, QuadraticSVC from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD +from skglm.datafits import Quadratic, Logistic, QuadraticSVC from skglm.utils import make_correlated_data, compiled_clone, spectral_norm2 -np.random.seed(0) +random_state = 113 n_samples, n_features = 50, 60 -X, y, _ = make_correlated_data( - n_samples=n_samples, n_features=n_features, random_state=0) -X_sparse = csc_matrix(X * np.random.binomial(1, 0.1, X.shape)) + +rng = np.random.RandomState(random_state) +X, y, _ = make_correlated_data(n_samples, n_features, random_state=rng) +rng.seed(random_state) +X_sparse = csc_matrix(X * np.random.binomial(1, 0.5, X.shape)) y_classif = np.sign(y) alpha_max = norm(X.T @ y, ord=np.inf) / len(y) -alpha = alpha_max / 100 +alpha = alpha_max / 10 tol = 1e-10 @@ -40,20 +44,23 @@ def test_fista_solver(X, Datafit, Penalty): penalty = compiled_clone(Penalty(alpha)) solver = FISTA(max_iter=1000, tol=tol) - res_fista = solver.solve(X, _y, datafit, penalty) + w_fista = solver.solve(X, _y, datafit, penalty)[0] solver_cd = AndersonCD(tol=tol, fit_intercept=False) - res_cd = solver_cd.solve(X, _y, datafit, penalty) + w_cd = solver_cd.solve(X, _y, datafit, penalty)[0] - np.testing.assert_allclose(res_fista[0], res_cd[0], rtol=1e-3) + np.testing.assert_allclose(w_fista, w_cd, atol=1e-7) def test_spectral_norm2(): - X_bundles = (X_sparse.data, X_sparse.indptr, X_sparse.indices) - lipschitz_our = spectral_norm2(*X_bundles, n_samples=len(y)) - lipschitz_np = norm(X_sparse.toarray(), ord=2) ** 2 + n_samples, n_features = 50, 60 + A_sparse = scipy.sparse.random(n_samples, n_features, density=0.5, format='csc') + + A_bundles = (A_sparse.data, A_sparse.indptr, A_sparse.indices) + lipschitz_our = spectral_norm2(*A_bundles, n_samples=len(y)) + lipschitz_np = norm(A_sparse.toarray(), ord=2) ** 2 - np.testing.assert_allclose(lipschitz_our, lipschitz_np, atol=1e-4, rtol=1e-5) + np.testing.assert_allclose(lipschitz_our, lipschitz_np) if __name__ == '__main__': From ed3686a9dde61dff7ff62aa6733c5f37e7f432b1 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 11:50:20 +0200 Subject: [PATCH 24/29] add docs for tol and max_iter && clean ups --- skglm/utils.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/skglm/utils.py b/skglm/utils.py index 4f83c8788..4e563bad0 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -490,6 +490,12 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples, n_samples : int number of rows of ``X``. + max_iter : int, default 20 + Maximum number of power method iterations. + + tol : float, default 1e-6 + Tolerance for convergence. 
+ Returns ------- eigenvalue : float @@ -561,24 +567,4 @@ def _XT_dot_vec(X_data, X_indptr, X_indices, vec): if __name__ == '__main__': - from scipy.sparse import csc_matrix, random - import time - n_samples, n_features = 500, 600 - A = random(n_samples, n_features, density=0.5, format='csc') - - X = csc_matrix(A) - X_dense = X.toarray() - - # cache numba compilation - M = random(5, 7, density=0.9, format='csc') - spectral_norm2(M.data, M.indptr, M.indices, 5) - - start = time.time() - spectral_norm2(X.data, X.indptr, X.indices, n_samples) - end = time.time() - print("our: ", end - start) - - start = time.time() - norm(X_dense, ord=2) ** 2 - end = time.time() - print("np: ", end - start) + pass From aa15c46c26d9c233a35d4319d5449d39b9842867 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 21 Oct 2022 13:53:51 +0200 Subject: [PATCH 25/29] remove square form spectral norm --- skglm/datafits/single_task.py | 12 +++++------ skglm/tests/test_fista.py | 14 +++++++------ skglm/utils.py | 38 +++++++++++++++++++++++++++-------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index c2bd741ff..63047f6fe 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -4,7 +4,7 @@ from numba import float64 from skglm.datafits.base import BaseDatafit -from skglm.utils import spectral_norm2 +from skglm.utils import spectral_norm class Quadratic(BaseDatafit): @@ -59,7 +59,7 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.Xty = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 self.global_lipschitz /= len(y) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) @@ -160,7 +160,7 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 - self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 self.global_lipschitz /= (len(y) * 4) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) @@ -242,8 +242,8 @@ def initialize(self, yXT, y): def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y): n_features = len(yXT_indptr) - 1 - self.global_lipschitz = spectral_norm2( - yXT_data, yXT_indptr, yXT_indices, len(y)) + self.global_lipschitz = spectral_norm( + yXT_data, yXT_indptr, yXT_indices, len(y)) ** 2 self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype) for j in range(n_features): @@ -336,7 +336,7 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 - self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 self.global_lipschitz /= len(y) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 528fe690a..559c5d9ca 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -4,12 +4,13 @@ from numpy.linalg import norm import scipy.sparse +import scipy.sparse.linalg from scipy.sparse import csc_matrix, issparse from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD from skglm.datafits import Quadratic, Logistic, QuadraticSVC -from skglm.utils 
+from skglm.utils import make_correlated_data, compiled_clone, spectral_norm
 
 
 random_state = 113
@@ -52,15 +53,16 @@ def test_fista_solver(X, Datafit, Penalty):
     np.testing.assert_allclose(w_fista, w_cd, atol=1e-7)
 
 
-def test_spectral_norm2():
+def test_spectral_norm():
     n_samples, n_features = 50, 60
-    A_sparse = scipy.sparse.random(n_samples, n_features, density=0.5, format='csc')
+    A_sparse = scipy.sparse.random(n_samples, n_features, density=0.7, format='csc',
+                                   random_state=random_state)
     A_bundles = (A_sparse.data, A_sparse.indptr, A_sparse.indices)
 
-    lipschitz_our = spectral_norm2(*A_bundles, n_samples=len(y))
-    lipschitz_np = norm(A_sparse.toarray(), ord=2) ** 2
+    spectral_norm_our = spectral_norm(*A_bundles, n_samples=len(y))
+    spectral_norm_sp = scipy.sparse.linalg.svds(A_sparse, k=1)[1]
 
-    np.testing.assert_allclose(lipschitz_our, lipschitz_np)
+    np.testing.assert_allclose(spectral_norm_our, spectral_norm_sp)
 
 
 if __name__ == '__main__':
diff --git a/skglm/utils.py b/skglm/utils.py
index 4e563bad0..c66185ef4 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -470,22 +470,22 @@ def _prox_vec(w, z, penalty, lipschitz):
 
 
 @njit
-def spectral_norm2(X_data, X_indptr, X_indices, n_samples,
-                   max_iter=20, tol=1e-6):
-    """Compute the squared spectral norm of sparse matrix ``X``.
+def spectral_norm(X_data, X_indptr, X_indices, n_samples,
+                  max_iter=20, tol=1e-6):
+    """Compute the spectral norm of sparse matrix ``X``.
 
     Find the largest eigenvalue of ``X @ X.T`` using the power method.
 
     Parameters
     ----------
     X_data : array, shape (n_elements,)
-        `data` attribute of the sparse CSC matrix ``X``.
+        ``data`` attribute of the sparse CSC matrix ``X``.
 
     X_indptr : array, shape (n_features + 1,)
-        `indptr` attribute of the sparse CSC matrix ``X``.
+        ``indptr`` attribute of the sparse CSC matrix ``X``.
 
     X_indices : array, shape (n_elements,)
-        `indices` attribute of the sparse CSC matrix ``X``.
+        ``indices`` attribute of the sparse CSC matrix ``X``.
 
     n_samples : int
         number of rows of ``X``.
@@ -499,7 +499,7 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples,
     Returns
     -------
     eigenvalue : float
-        The largest eigenvalue of ``X.T @ X``, aka the squared spectral norm of ``X``.
+        The largest singular value of ``X``.
     References
     ----------
@@ -526,7 +526,7 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples,
 
     eigenvector = vec / norm_vec
 
-    return eigenvalue
+    return np.sqrt(eigenvalue)
 
 
 @njit
@@ -567,4 +567,26 @@ def _XT_dot_vec(X_data, X_indptr, X_indices, vec):
 
 
 if __name__ == '__main__':
+    import time
+    import scipy.sparse.linalg
+    from scipy.sparse import csc_matrix, random
+    n_samples, n_features = 500, 600
+    A = random(n_samples, n_features, density=0.5, format='csc')
+
+    X = csc_matrix(A)
+    X_dense = X.toarray()
+
+    # cache numba compilation
+    M = random(5, 7, density=0.9, format='csc')
+    spectral_norm(M.data, M.indptr, M.indices, 5)
+
+    start = time.time()
+    spectral_norm(X.data, X.indptr, X.indices, n_samples)
+    end = time.time()
+    print("our: ", end - start)
+
+    start = time.time()
+    scipy.sparse.linalg.svds(X, k=1)[1]
+    end = time.time()
+    print("scipy: ", end - start)
     pass

From 27b918dca8c66bf4405645661e9191dee4f9e2f2 Mon Sep 17 00:00:00 2001
From: Badr-MOUFAD
Date: Fri, 21 Oct 2022 14:11:32 +0200
Subject: [PATCH 26/29] refactor ``_prox_vec`` function

---
 skglm/solvers/fista.py |  8 ++++----
 skglm/utils.py         | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py
index 417af1a7e..bbdc451b7 100644
--- a/skglm/solvers/fista.py
+++ b/skglm/solvers/fista.py
@@ -8,8 +8,6 @@ class FISTA(BaseSolver):
     r"""ISTA solver with Nesterov acceleration (FISTA).
 
-    This solver implements accelerated proximal gradient descent for linear problems.
-
     Attributes
     ----------
     max_iter : int, default 100
@@ -59,8 +57,10 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
                 X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features)
         else:
             grad = construct_grad(X, y, z, X @ z, datafit, all_features)
-        z -= grad / lipschitz
-        w = _prox_vec(w, z, penalty, lipschitz)
+
+        step = 1 / lipschitz
+        z -= step * grad
+        w = _prox_vec(w, z, penalty, step)
         Xw = X @ w
         z = w + (t_old - 1.) / t_new * (w - w_old)
 
diff --git a/skglm/utils.py b/skglm/utils.py
index c66185ef4..b6ba50a50 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -460,12 +460,12 @@ def extrapolate(self, w, Xw):
 
 
 @njit
-def _prox_vec(w, z, penalty, lipschitz):
-    # evaluate the vectorized proximal operator for the FISTA solver
-    # lipschitz stands for global lipschitz constant
+def _prox_vec(w, z, penalty, step):
+    # evaluate the vectorized proximal operator
+    # step is 1 / lipschitz, with lipschitz the global Lipschitz constant
     n_features = w.shape[0]
     for j in range(n_features):
-        w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j)
+        w[j] = penalty.prox_1d(z[j], step, j)
     return w
 
 
 @njit
 def spectral_norm(X_data, X_indptr, X_indices, n_samples,
                   max_iter=20, tol=1e-6):
     """Compute the spectral norm of sparse matrix ``X``.
 
-    Find the largest eigenvalue of ``X @ X.T`` using the power method.
+    Apply the power method to ``X @ X.T`` to find its largest eigenvalue.
 
     Parameters
     ----------
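
Note on patch 26: with ``step = 1 / lipschitz``, the loop body of ``solve`` is the
textbook FISTA iteration. A minimal, self-contained NumPy sketch of the same update
for the Lasso case, where the prox reduces to soft-thresholding (``fista_lasso`` and
``soft_threshold`` are illustrative names, not part of skglm):

    import numpy as np

    def soft_threshold(x, tau):
        # prox of tau * ||.||_1, applied coordinate-wise
        return np.sign(x) * np.maximum(np.abs(x) - tau, 0.)

    def fista_lasso(X, y, alpha, max_iter=1000):
        # minimize ||y - X @ w||^2 / (2 * n_samples) + alpha * ||w||_1
        n_samples, n_features = X.shape
        lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples
        step = 1 / lipschitz
        w = np.zeros(n_features)
        z = np.zeros(n_features)
        t_new = 1.
        for _ in range(max_iter):
            t_old = t_new
            t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2
            w_old = w.copy()
            # gradient of the quadratic datafit, evaluated at z
            grad = X.T @ (X @ z - y) / n_samples
            # prox step with step size 1 / lipschitz
            w = soft_threshold(z - step * grad, step * alpha)
            # Nesterov extrapolation
            z = w + (t_old - 1.) / t_new * (w - w_old)
        return w

Passing ``step`` to ``_prox_vec`` rather than ``lipschitz`` computes ``1 / lipschitz``
once per iteration instead of once per coordinate, and keeps the prox helper agnostic
to how the step size was obtained.
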
From 9d8e3c0d1ab92f9cd631b02241a7cd832c0b771e Mon Sep 17 00:00:00 2001
From: Badr-MOUFAD
Date: Fri, 21 Oct 2022 17:48:17 +0200
Subject: [PATCH 27/29] fix segmentation fault bug

---
 skglm/datafits/single_task.py  |  2 +-
 skglm/tests/test_estimators.py |  8 ++------
 skglm/utils.py                 | 25 +------------------------
 3 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py
index 63047f6fe..09b19beb1 100644
--- a/skglm/datafits/single_task.py
+++ b/skglm/datafits/single_task.py
@@ -243,7 +243,7 @@ def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y):
         n_features = len(yXT_indptr) - 1
 
         self.global_lipschitz = spectral_norm(
-            yXT_data, yXT_indptr, yXT_indices, len(y)) ** 2
+            yXT_data, yXT_indptr, yXT_indices, max(yXT_indices)+1) ** 2
 
         self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype)
         for j in range(n_features):
diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py
index 348c5ef67..9e1e1cd71 100644
--- a/skglm/tests/test_estimators.py
+++ b/skglm/tests/test_estimators.py
@@ -25,12 +25,8 @@
 from skglm.solvers import AndersonCD
 
 
-n_samples = 50
-n_tasks = 9
-n_features = 60
-X, Y, _ = make_correlated_data(
-    n_samples=n_samples, n_features=n_features, n_tasks=n_tasks, density=0.1,
-    random_state=0)
+n_samples, n_features, n_tasks = 50, 60, 9
+X, Y, _ = make_correlated_data(n_samples, n_features, n_tasks, random_state=0)
 y = Y[:, 0]
 
 np.random.seed(0)
diff --git a/skglm/utils.py b/skglm/utils.py
index b6ba50a50..ed982fade 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -533,8 +533,7 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples,
 def _XXT_dot_vec(X_data, X_indptr, X_indices, vec, n_samples):
     # computes X @ X.T @ vec, with X csc encoded
     return _X_dot_vec(X_data, X_indptr, X_indices,
-                      _XT_dot_vec(X_data, X_indptr, X_indices, vec),
-                      n_samples)
+                      _XT_dot_vec(X_data, X_indptr, X_indices, vec), n_samples)
 
 
 @njit
@@ -567,26 +566,4 @@ def _XT_dot_vec(X_data, X_indptr, X_indices, vec):
 
 
 if __name__ == '__main__':
-    import time
-    import scipy.sparse.linalg
-    from scipy.sparse import csc_matrix, random
-    n_samples, n_features = 500, 600
-    A = random(n_samples, n_features, density=0.5, format='csc')
-
-    X = csc_matrix(A)
-    X_dense = X.toarray()
-
-    # cache numba compilation
-    M = random(5, 7, density=0.9, format='csc')
-    spectral_norm(M.data, M.indptr, M.indices, 5)
-
-    start = time.time()
-    spectral_norm(X.data, X.indptr, X.indices, n_samples)
-    end = time.time()
-    print("our: ", end - start)
-
-    start = time.time()
-    scipy.sparse.linalg.svds(X, k=1)[1]
-    end = time.time()
-    print("scipy: ", end - start)
     pass

From e5ce21bf3704ab5d38d26b323758d888b10ad618 Mon Sep 17 00:00:00 2001
From: Badr-MOUFAD
Date: Fri, 21 Oct 2022 18:01:42 +0200
Subject: [PATCH 28/29] add FISTA to docs && fix unittest

---
 doc/api.rst                    | 1 +
 skglm/tests/test_estimators.py | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index 4d003891e..d05ed8a57 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -70,6 +70,7 @@ Solvers
    :toctree: generated/
 
    AndersonCD
+   FISTA
    GramCD
    GroupBCD
    MultiTaskBCD

diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py
index 9e1e1cd71..348c5ef67 100644
--- a/skglm/tests/test_estimators.py
+++ b/skglm/tests/test_estimators.py
@@ -25,8 +25,12 @@
 from skglm.solvers import AndersonCD
 
 
-n_samples, n_features, n_tasks = 50, 60, 9
-X, Y, _ = make_correlated_data(n_samples, n_features, n_tasks, random_state=0)
+n_samples = 50
+n_tasks = 9
+n_features = 60
+X, Y, _ = make_correlated_data(
+    n_samples=n_samples, n_features=n_features, n_tasks=n_tasks, density=0.1,
+    random_state=0)
 y = Y[:, 0]
 
 np.random.seed(0)
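
Note on patch 27: ``spectral_norm`` allocates and indexes arrays of length
``n_samples`` inside numba-compiled code, which does no bounds checking by default,
so passing a row count smaller than the true number of rows of the CSC matrix (here,
``len(y)`` instead of the row count of the transposed matrix ``yXT``) writes out of
bounds -- the likely cause of the segmentation fault fixed above. ``max(yXT_indices)
+ 1`` recovers the row count from the CSC buffers themselves. A small illustration of
the idea (the matrix ``A`` is hypothetical, not taken from the skglm test suite):

    import scipy.sparse

    A = scipy.sparse.random(30, 50, density=0.3, format='csc', random_state=0)

    # a CSC matrix stores only data/indptr/indices, so its row count is not
    # in the buffers; max(indices) + 1 recovers it, up to trailing all-zero
    # rows, which it undercounts -- harmless for the power method, since
    # all-zero rows do not contribute to the spectral norm
    n_rows = max(A.indices) + 1
    assert n_rows <= A.shape[0]
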
From 5d2dbaf2504347701bf070f77abbe944765c705b Mon Sep 17 00:00:00 2001
From: mathurinm
Date: Sat, 22 Oct 2022 16:23:58 +0200
Subject: [PATCH 29/29] cosmetic changes

---
 skglm/datafits/single_task.py |  2 +-
 skglm/utils.py                | 18 ++++++------------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py
index 09b19beb1..948cd1454 100644
--- a/skglm/datafits/single_task.py
+++ b/skglm/datafits/single_task.py
@@ -161,7 +161,7 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y):
         n_features = len(X_indptr) - 1
 
         self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2
-        self.global_lipschitz /= (len(y) * 4)
+        self.global_lipschitz /= 4 * len(y)
 
         self.lipschitz = np.zeros(n_features, dtype=X_data.dtype)
         for j in range(n_features):
diff --git a/skglm/utils.py b/skglm/utils.py
index ed982fade..2932c1822 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -461,8 +461,7 @@ def extrapolate(self, w, Xw):
 
 @njit
 def _prox_vec(w, z, penalty, step):
-    # evaluate the vectorized proximal operator
-    # step is 1 / lipschitz, with lipschitz the global Lipschitz constant
+    # evaluate the full proximal operator
    n_features = w.shape[0]
    for j in range(n_features):
        w[j] = penalty.prox_1d(z[j], step, j)
@@ -471,10 +470,8 @@ def _prox_vec(w, z, penalty, step):
 
 @njit
 def spectral_norm(X_data, X_indptr, X_indices, n_samples,
-                  max_iter=20, tol=1e-6):
-    """Compute the spectral norm of sparse matrix ``X``.
-
-    Apply the power method to ``X @ X.T`` to find its largest eigenvalue.
+                  max_iter=100, tol=1e-6):
+    """Compute the spectral norm of sparse matrix ``X`` with power method.
 
     Parameters
     ----------
@@ -504,11 +501,8 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples,
     References
     ----------
     .. [1] Alfio Quarteroni, Riccardo Sacco, Fausto Saleri "Numerical Mathematics",
-        chapiter 5, page 192-195.
+        chapter 5, page 192-195.
     """
-    # tol is squared as we evaluate the square of inequality (5.25) in ref [1]
-    tol = tol ** 2
-
     # init vec with norm(vec) == 1.
     eigenvector = np.random.randn(n_samples)
     eigenvector /= norm(eigenvector)
@@ -520,8 +514,8 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples,
 
     eigenvalue = vec @ eigenvector
 
     # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) <= tol
-    # inequality is squared
-    if norm_vec ** 2 - eigenvalue ** 2 <= tol:
+    # inequality (5.25) in ref [1] is squared
+    if norm_vec ** 2 - eigenvalue ** 2 <= tol ** 2:
         break
 
     eigenvector = vec / norm_vec
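
For readers who want to check the stopping criterion of patch 29 outside numba, here
is a dense NumPy sketch of the same power method (illustrative only; skglm's version
runs on the CSC buffers, and ``power_method_spectral_norm`` is not a skglm function):

    import numpy as np
    from numpy.linalg import norm

    def power_method_spectral_norm(X, max_iter=100, tol=1e-6):
        # power iteration on X @ X.T: a dense analogue of spectral_norm
        eigenvector = np.random.default_rng(0).standard_normal(X.shape[0])
        eigenvector /= norm(eigenvector)

        for _ in range(max_iter):
            vec = X @ (X.T @ eigenvector)
            norm_vec = norm(vec)
            eigenvalue = vec @ eigenvector

            # with a unit-norm eigenvector, the squared residual
            # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) ** 2
            # expands to norm_vec ** 2 - eigenvalue ** 2, which is why the
            # criterion is compared against tol ** 2
            if norm_vec ** 2 - eigenvalue ** 2 <= tol ** 2:
                break
            eigenvector = vec / norm_vec

        return np.sqrt(eigenvalue)

    X = np.random.default_rng(0).standard_normal((50, 60))
    print(power_method_spectral_norm(X), norm(X, ord=2))  # the two agree closely
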