From 0868b0f17c6cb593de3d1c4fb59b9b6785601015 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Wed, 12 Oct 2022 20:42:31 +0200 Subject: [PATCH 01/29] POC FISTA --- skglm/solvers/__init__.py | 3 +- skglm/solvers/fista.py | 61 +++++++++++++++++++++++++++++++++++++++ toy_fista.py | 27 +++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 skglm/solvers/fista.py create mode 100644 toy_fista.py diff --git a/skglm/solvers/__init__.py b/skglm/solvers/__init__.py index 0f8016f40..d86eb275a 100644 --- a/skglm/solvers/__init__.py +++ b/skglm/solvers/__init__.py @@ -1,9 +1,10 @@ from .anderson_cd import AndersonCD from .base import BaseSolver +from .fista import FISTA from .gram_cd import GramCD from .group_bcd import GroupBCD from .multitask_bcd import MultiTaskBCD from .prox_newton import ProxNewton -__all__ = [AndersonCD, BaseSolver, GramCD, GroupBCD, MultiTaskBCD, ProxNewton] +__all__ = [AndersonCD, BaseSolver, FISTA, GramCD, GroupBCD, MultiTaskBCD, ProxNewton] diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py new file mode 100644 index 000000000..3aff84d0f --- /dev/null +++ b/skglm/solvers/fista.py @@ -0,0 +1,61 @@ +import numpy as np +from numba import njit +from skglm.solvers.base import BaseSolver + + +class FISTA(BaseSolver): + r"""ISTA solver with Nesterov acceleration (FISTA).""" + + def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, + opt_freq=50, verbose=0): + self.max_iter = max_iter + self.tol = tol + self.fit_intercept = fit_intercept + self.warm_start = warm_start + self.opt_freq = opt_freq + self.verbose = verbose + + def solve(self, X, y, penalty, w_init=None, weights=None): + # needs a quadratic datafit, but works with L1, WeightedL1, SLOPE + n_samples, n_features = X.shape + all_features = np.arange(n_features) + t_new = 1 + + w = w_init.copy() if w_init is not None else np.zeros(n_features) + z = w_init.copy() if w_init is not None else np.zeros(n_features) + weights = weights if weights is not None else np.ones(n_features) + + # FISTA with Gram update + G = X.T @ X + Xty = X.T @ y + lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples + + for n_iter in range(self.max_iter): + t_old = t_new + t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 + w_old = w.copy() + grad = (G @ z - Xty) / n_samples + z -= grad / lipschitz + # TODO: TO DISCUSS! + # XXX: should add a full prox update + for j in range(n_features): + w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + z = w + (t_old - 1.) 
/ t_new * (w - w_old) + + if n_iter % self.opt_freq == 0: + opt = penalty.subdiff_distance(w, grad, all_features) + stop_crit = np.max(opt) + + if self.verbose: + p_obj = (np.sum((y - X @ w) ** 2) / (2 * n_samples) + + penalty.alpha * penalty.value(w)) + print( + f"Iteration {n_iter+1}: {p_obj:.10f}, " + f"stopping crit: {stop_crit:.2e}" + ) + + if stop_crit < self.tol: + if self.verbose: + print(f"Stopping criterion max violation: {stop_crit:.2e}") + break + return w diff --git a/toy_fista.py b/toy_fista.py new file mode 100644 index 000000000..fb26f99be --- /dev/null +++ b/toy_fista.py @@ -0,0 +1,27 @@ +import numpy as np +from numpy.linalg import norm +from skglm.solvers import FISTA +from skglm.penalties import L1 +from skglm.estimators import Lasso +from skglm.utils import make_correlated_data, compiled_clone + + +X, y, _ = make_correlated_data(n_samples=200, n_features=100, random_state=24) + +n_samples, n_features = X.shape +alpha_max = norm(X.T @ y, ord=np.inf) / n_samples + +alpha = alpha_max / 10 + +max_iter = 1000 +obj_freq = 100 +tol = 1e-10 + +solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) +penalty = compiled_clone(L1(alpha)) +w = solver.solve(X, y, penalty) + +clf = Lasso(alpha=alpha, tol=tol, fit_intercept=False) +clf.fit(X, y) + +np.testing.assert_allclose(w, clf.coef_, rtol=1e-5) From 8584299888f3f7fca22156503367c849d67ca8f7 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 14:50:17 +0200 Subject: [PATCH 02/29] CLN --- skglm/solvers/fista.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 3aff84d0f..30e449721 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -3,16 +3,25 @@ from skglm.solvers.base import BaseSolver +@njit +def _prox_vec(w, z, penalty, lipschitz): + # XXX: TO DISCUSS: should add a vectorized prox update + n_features = w.shape[0] + for j in range(n_features): + w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + return w + + class FISTA(BaseSolver): r"""ISTA solver with Nesterov acceleration (FISTA).""" def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, - opt_freq=50, verbose=0): + opt_freq=100, verbose=0): self.max_iter = max_iter self.tol = tol self.fit_intercept = fit_intercept self.warm_start = warm_start - self.opt_freq = opt_freq + self.opt_freq = opt_freq self.verbose = verbose def solve(self, X, y, penalty, w_init=None, weights=None): @@ -36,10 +45,7 @@ def solve(self, X, y, penalty, w_init=None, weights=None): w_old = w.copy() grad = (G @ z - Xty) / n_samples z -= grad / lipschitz - # TODO: TO DISCUSS! - # XXX: should add a full prox update - for j in range(n_features): - w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + w = _prox_vec(w, z, penalty, lipschitz) z = w + (t_old - 1.) 
/ t_new * (w - w_old) if n_iter % self.opt_freq == 0: @@ -48,7 +54,7 @@ def solve(self, X, y, penalty, w_init=None, weights=None): if self.verbose: p_obj = (np.sum((y - X @ w) ** 2) / (2 * n_samples) - + penalty.alpha * penalty.value(w)) + + penalty.value(w)) print( f"Iteration {n_iter+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" From c82e32eb1b5af1d389d68d3dfad3a38cd8169378 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 15:09:26 +0200 Subject: [PATCH 03/29] changed obj_freq from 100 to 10 --- skglm/solvers/fista.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 30e449721..226ed4f89 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -5,7 +5,6 @@ @njit def _prox_vec(w, z, penalty, lipschitz): - # XXX: TO DISCUSS: should add a vectorized prox update n_features = w.shape[0] for j in range(n_features): w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) @@ -16,7 +15,7 @@ class FISTA(BaseSolver): r"""ISTA solver with Nesterov acceleration (FISTA).""" def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, - opt_freq=100, verbose=0): + opt_freq=10, verbose=0): self.max_iter = max_iter self.tol = tol self.fit_intercept = fit_intercept From 4940a0d2cff4208b73211b20f4c39b670b8a159a Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 15:48:35 +0200 Subject: [PATCH 04/29] WIP Lipschitz --- skglm/solvers/fista.py | 17 ++++++++--------- toy_fista.py | 5 ++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 226ed4f89..336e65450 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -1,6 +1,7 @@ import numpy as np from numba import njit from skglm.solvers.base import BaseSolver +from skglm.solvers.common import construct_grad @njit @@ -23,28 +24,27 @@ def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False self.opt_freq = opt_freq self.verbose = verbose - def solve(self, X, y, penalty, w_init=None, weights=None): - # needs a quadratic datafit, but works with L1, WeightedL1, SLOPE + def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): n_samples, n_features = X.shape all_features = np.arange(n_features) t_new = 1 w = w_init.copy() if w_init is not None else np.zeros(n_features) z = w_init.copy() if w_init is not None else np.zeros(n_features) - weights = weights if weights is not None else np.ones(n_features) + Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - # FISTA with Gram update - G = X.T @ X - Xty = X.T @ y + # line search? + # lipschitz = np.max(datafit.lipschitz) lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples for n_iter in range(self.max_iter): t_old = t_new t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 w_old = w.copy() - grad = (G @ z - Xty) / n_samples + grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz w = _prox_vec(w, z, penalty, lipschitz) + Xw = X @ w z = w + (t_old - 1.) 
/ t_new * (w - w_old) if n_iter % self.opt_freq == 0: @@ -52,8 +52,7 @@ def solve(self, X, y, penalty, w_init=None, weights=None): stop_crit = np.max(opt) if self.verbose: - p_obj = (np.sum((y - X @ w) ** 2) / (2 * n_samples) - + penalty.value(w)) + p_obj = datafit.value(y, w, Xw) + penalty.value(w) print( f"Iteration {n_iter+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" diff --git a/toy_fista.py b/toy_fista.py index fb26f99be..476ed578d 100644 --- a/toy_fista.py +++ b/toy_fista.py @@ -1,5 +1,6 @@ import numpy as np from numpy.linalg import norm +from skglm.datafits.single_task import Quadratic from skglm.solvers import FISTA from skglm.penalties import L1 from skglm.estimators import Lasso @@ -18,8 +19,10 @@ tol = 1e-10 solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) +datafit = compiled_clone(Quadratic()) +datafit.initialize(X, y) penalty = compiled_clone(L1(alpha)) -w = solver.solve(X, y, penalty) +w = solver.solve(X, y, datafit, penalty) clf = Lasso(alpha=alpha, tol=tol, fit_intercept=False) clf.fit(X, y) From e47c68a91532be49aa7f975cb782f5b05dcae894 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 17:03:49 +0200 Subject: [PATCH 05/29] ADD global lipschitz constants --- skglm/datafits/single_task.py | 45 +++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 1f4d39503..4aefaac0a 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -22,6 +22,10 @@ class Quadratic(BaseDatafit): The coordinatewise gradient Lipschitz constants. Equal to norm(X, axis=0) ** 2 / n_samples. + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(X, ord=2) ** 2 / n_samples. + Note ---- The class is jit compiled at fit time using Numba compiler. @@ -35,6 +39,7 @@ def get_spec(self): spec = ( ('Xty', float64[:]), ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -44,6 +49,7 @@ def params_to_dict(self): def initialize(self, X, y): self.Xty = X.T @ y n_features = X.shape[1] + self.global_lipschitz = norm(X, ord=2) ** 2 / len(y) self.lipschitz = np.zeros(n_features, dtype=X.dtype) for j in range(n_features): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) @@ -53,15 +59,19 @@ def initialize_sparse( n_features = len(X_indptr) - 1 self.Xty = np.zeros(n_features, dtype=X_data.dtype) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. xty = 0 + x2 = 0. for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 xty += X_data[idx] * y[X_indices[idx]] + x2 += X_data[idx] ** 2 / len(y) self.lipschitz[j] = nrm2 / len(y) self.Xty[j] = xty + self.global_lipschitz += x2 def value(self, y, w, Xw): return np.sum((y - Xw) ** 2) / (2 * len(Xw)) @@ -111,6 +121,10 @@ class Logistic(BaseDatafit): The coordinatewise gradient Lipschitz constants. Equal to norm(X, axis=0) ** 2 / (4 * n_samples). + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(X, ord=2) ** 2 / (4 * n_samples). + Note ---- The class is jit compiled at fit time using Numba compiler. 
@@ -123,6 +137,7 @@ def __init__(self): def get_spec(self): spec = ( ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -140,13 +155,16 @@ def raw_hessian(self, y, Xw): def initialize(self, X, y): self.lipschitz = (X ** 2).sum(axis=0) / (len(y) * 4) + self.global_lipschitz = norm(X, ord=2) ** 2 / (len(y) * 4) def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): Xj = X_data[X_indptr[j]:X_indptr[j+1]] self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4) + self.global_lipschitz += (Xj ** 2).sum() / (len(y) * 4) def value(self, y, w, Xw): return np.log(1. + np.exp(- y * Xw)).sum() / len(y) @@ -187,6 +205,11 @@ class QuadraticSVC(BaseDatafit): ---------- lipschitz : array, shape (n_features,) The coordinatewise gradient Lipschitz constants. + Equal to norm(yXT, axis=0) ** 2. + + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(yXT, ord=2) ** 2. Note ---- @@ -200,6 +223,7 @@ def __init__(self): def get_spec(self): spec = ( ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -209,18 +233,22 @@ def params_to_dict(self): def initialize(self, yXT, y): n_features = yXT.shape[1] self.lipschitz = np.zeros(n_features, dtype=yXT.dtype) + self.global_lipschitz = 0. for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 + self.global_lipschitz += norm(yXT[:, j]) ** 2 def initialize_sparse( self, yXT_data, yXT_indptr, yXT_indices, y): n_features = len(yXT_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. for idx in range(yXT_indptr[j], yXT_indptr[j + 1]): nrm2 += yXT_data[idx] ** 2 self.lipschitz[j] = nrm2 + self.global_lipschitz += nrm2 def value(self, y, w, yXTw): return (yXTw ** 2).sum() / 2 - np.sum(w) @@ -264,8 +292,16 @@ class Huber(BaseDatafit): Attributes ---------- + delta : float + Shape hyperparameter. + lipschitz : array, shape (n_features,) - The coordinatewise gradient Lipschitz constants. + The coordinatewise gradient Lipschitz constants. Equal to + norm(X, axis=0) ** 2 / n_samples. + + global_lipschitz : float + Global Lipschitz constant. Equal to + norm(X, ord=2) ** 2 / n_samples. Note ---- @@ -279,7 +315,8 @@ def __init__(self, delta): def get_spec(self): spec = ( ('delta', float64), - ('lipschitz', float64[:]) + ('lipschitz', float64[:]), + ('global_lipschitz', float64), ) return spec @@ -289,18 +326,22 @@ def params_to_dict(self): def initialize(self, X, y): n_features = X.shape[1] self.lipschitz = np.zeros(n_features, dtype=X.dtype) + self.global_lipschitz = 0. for j in range(n_features): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) + self.global_lipschitz += (X[:, j] ** 2).sum() / len(y) def initialize_sparse( self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) + self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. 
for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 self.lipschitz[j] = nrm2 / len(y) + self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): n_samples = len(y) From 3635f241c31ac851da09d9c04cfbb17f9b7051c7 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 17:10:49 +0200 Subject: [PATCH 06/29] FISTA with global lipschitz --- skglm/solvers/fista.py | 5 ++--- toy_fista.py | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 336e65450..8b9cebeeb 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -33,9 +33,8 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): z = w_init.copy() if w_init is not None else np.zeros(n_features) Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - # line search? - # lipschitz = np.max(datafit.lipschitz) - lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples + # TODO: OR line search + lipschitz = datafit.global_lipschitz for n_iter in range(self.max_iter): t_old = t_new diff --git a/toy_fista.py b/toy_fista.py index 476ed578d..1c29bf713 100644 --- a/toy_fista.py +++ b/toy_fista.py @@ -1,9 +1,9 @@ import numpy as np from numpy.linalg import norm -from skglm.datafits.single_task import Quadratic +from skglm.datafits.single_task import Quadratic, Logistic from skglm.solvers import FISTA from skglm.penalties import L1 -from skglm.estimators import Lasso +from skglm.estimators import SparseLogisticRegression, Lasso from skglm.utils import make_correlated_data, compiled_clone @@ -19,6 +19,10 @@ tol = 1e-10 solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) + +############## +# Quadratic +############## datafit = compiled_clone(Quadratic()) datafit.initialize(X, y) penalty = compiled_clone(L1(alpha)) @@ -28,3 +32,20 @@ clf.fit(X, y) np.testing.assert_allclose(w, clf.coef_, rtol=1e-5) + +############## +# Logistic +############## +y = np.sign(y) +alpha_max = norm(X.T @ y, ord=np.inf) / (4 * n_samples) +alpha = alpha_max / 10 + +datafit = compiled_clone(Logistic()) +datafit.initialize(X, y) +penalty = compiled_clone(L1(alpha)) +w = solver.solve(X, y, datafit, penalty) + +clf = SparseLogisticRegression(alpha=alpha, tol=tol, fit_intercept=False) +clf.fit(X, y) + +np.testing.assert_allclose(w, np.squeeze(clf.coef_), rtol=1e-3) From 488011236c5b411a57572cb2544e16a023b15454 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 21:25:15 +0200 Subject: [PATCH 07/29] writing tests --- skglm/datafits/single_task.py | 7 ++---- skglm/solvers/fista.py | 38 ++++++++++++++++++++++------ skglm/tests/test_fista.py | 47 +++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 12 deletions(-) create mode 100644 skglm/tests/test_fista.py diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 4aefaac0a..583312083 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -63,15 +63,13 @@ def initialize_sparse( for j in range(n_features): nrm2 = 0. xty = 0 - x2 = 0. 
for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 xty += X_data[idx] * y[X_indices[idx]] - x2 += X_data[idx] ** 2 / len(y) self.lipschitz[j] = nrm2 / len(y) self.Xty[j] = xty - self.global_lipschitz += x2 + self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): return np.sum((y - Xw) ** 2) / (2 * len(Xw)) @@ -233,10 +231,9 @@ def params_to_dict(self): def initialize(self, yXT, y): n_features = yXT.shape[1] self.lipschitz = np.zeros(n_features, dtype=yXT.dtype) - self.global_lipschitz = 0. + self.global_lipschitz = norm(yXT, ord=2) ** 2 / len(y) for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 - self.global_lipschitz += norm(yXT[:, j]) ** 2 def initialize_sparse( self, yXT_data, yXT_indptr, yXT_indices, y): diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 8b9cebeeb..aaafca104 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -13,14 +13,35 @@ def _prox_vec(w, z, penalty, lipschitz): class FISTA(BaseSolver): - r"""ISTA solver with Nesterov acceleration (FISTA).""" + r"""ISTA solver with Nesterov acceleration (FISTA). - def __init__(self, max_iter=100, tol=1e-4, fit_intercept=False, warm_start=False, - opt_freq=10, verbose=0): + This solver implements accelerated proximal gradient descent for linear problems. + + Attributes + ---------- + max_iter : int, default 100 + Maximum number of iterations. + + tol : float, default 1e-4 + Tolerance for convergence. + + opt_freq : int, default 10 + Frequency for optimality condition check. + + verbose : bool, default False + Amount of verbosity. 0/False is silent. + + References + ---------- + .. [1] Beck, A. and Teboulle M. + "A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse + problems", 2009, SIAM J. Imaging Sci. 
+ https://epubs.siam.org/doi/10.1137/080716542 + """ + + def __init__(self, max_iter=100, tol=1e-4, opt_freq=10, verbose=0): self.max_iter = max_iter self.tol = tol - self.fit_intercept = fit_intercept - self.warm_start = warm_start self.opt_freq = opt_freq self.verbose = verbose @@ -33,8 +54,11 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): z = w_init.copy() if w_init is not None else np.zeros(n_features) Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - # TODO: OR line search - lipschitz = datafit.global_lipschitz + if hasattr(datafit, "global_lipschitz"): + lipschitz = datafit.global_lipschitz + else: + # TODO: OR line search + raise Exception("Line search is not yet implemented for FISTA solver.") for n_iter in range(self.max_iter): t_old = t_new diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py new file mode 100644 index 000000000..e921c146f --- /dev/null +++ b/skglm/tests/test_fista.py @@ -0,0 +1,47 @@ +import pytest + +import numpy as np +from numpy.linalg import norm + +from skglm.datafits import Quadratic, Logistic, QuadraticSVC +from skglm.estimators import Lasso, LinearSVC, SparseLogisticRegression +from skglm.penalties import L1, IndicatorBox +from skglm.solvers import FISTA +from skglm.utils import make_correlated_data, compiled_clone + + +n_samples, n_features = 10, 20 +X, y, _ = make_correlated_data( + n_samples=n_samples, n_features=n_features, random_state=0) +y_classif = np.sign(y) + +alpha_max = norm(X.T @ y, ord=np.inf) / len(y) +alpha = alpha_max / 100 + +tol = 1e-8 + +# TODO: use GeneralizedLinearEstimator (to test global lipschtiz constants of every datafit) +# TODO: test sparse matrices (global lipschitz constants) +@pytest.mark.parametrize("Datafit, Penalty, Estimator", [ + (Quadratic, L1, Lasso), + (Logistic, L1, SparseLogisticRegression), + (QuadraticSVC, IndicatorBox, LinearSVC), +]) +def test_fista_solver(Datafit, Penalty, Estimator): + _y = y if isinstance(Datafit, Quadratic) else y_classif + datafit = compiled_clone(Datafit()) + _init = y @ X.T if isinstance(Datafit, QuadraticSVC) else X + datafit.initialize(_init, _y) + penalty = compiled_clone(Penalty(alpha)) + + solver = FISTA(max_iter=1000, tol=tol) + w = solver.solve(X, _y, datafit, penalty) + + estimator = Estimator(alpha, tol=tol, fit_intercept=False) + estimator.fit(X, _y) + + np.testing.assert_allclose(w, estimator.coef_.flatten(), rtol=1e-3) + + +if __name__ == '__main__': + pass From 46a9a76fe3d88b1779729352013f5d5be053cf67 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:20:07 +0200 Subject: [PATCH 08/29] better tests --- skglm/tests/test_fista.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index e921c146f..04c1ed516 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -2,45 +2,50 @@ import numpy as np from numpy.linalg import norm +from scipy.sparse import csc_matrix, issparse from skglm.datafits import Quadratic, Logistic, QuadraticSVC -from skglm.estimators import Lasso, LinearSVC, SparseLogisticRegression from skglm.penalties import L1, IndicatorBox -from skglm.solvers import FISTA +from skglm.solvers import FISTA, AndersonCD from skglm.utils import make_correlated_data, compiled_clone -n_samples, n_features = 10, 20 +np.random.seed(0) +n_samples, n_features = 50, 60 X, y, _ = make_correlated_data( n_samples=n_samples, n_features=n_features, random_state=0) +X_sparse = 
csc_matrix(X * np.random.binomial(1, 0.1, X.shape)) y_classif = np.sign(y) alpha_max = norm(X.T @ y, ord=np.inf) / len(y) alpha = alpha_max / 100 -tol = 1e-8 +tol = 1e-10 -# TODO: use GeneralizedLinearEstimator (to test global lipschtiz constants of every datafit) -# TODO: test sparse matrices (global lipschitz constants) -@pytest.mark.parametrize("Datafit, Penalty, Estimator", [ - (Quadratic, L1, Lasso), - (Logistic, L1, SparseLogisticRegression), - (QuadraticSVC, IndicatorBox, LinearSVC), + +@pytest.mark.parametrize("X", [X, X_sparse]) +@pytest.mark.parametrize("Datafit, Penalty", [ + (Quadratic, L1), + (Logistic, L1), + (QuadraticSVC, IndicatorBox), ]) -def test_fista_solver(Datafit, Penalty, Estimator): +def test_fista_solver(X, Datafit, Penalty): _y = y if isinstance(Datafit, Quadratic) else y_classif datafit = compiled_clone(Datafit()) _init = y @ X.T if isinstance(Datafit, QuadraticSVC) else X - datafit.initialize(_init, _y) + if issparse(X): + datafit.initialize_sparse(_init.data, _init.indptr, _init.indices, _y) + else: + datafit.initialize(_init, _y) penalty = compiled_clone(Penalty(alpha)) - solver = FISTA(max_iter=1000, tol=tol) + solver = FISTA(max_iter=1000, tol=tol, opt_freq=1) w = solver.solve(X, _y, datafit, penalty) - estimator = Estimator(alpha, tol=tol, fit_intercept=False) - estimator.fit(X, _y) + solver_cd = AndersonCD(tol=tol, fit_intercept=False) + w_cd = solver_cd.solve(X, _y, datafit, penalty)[0] - np.testing.assert_allclose(w, estimator.coef_.flatten(), rtol=1e-3) + np.testing.assert_allclose(w, w_cd, rtol=1e-3) if __name__ == '__main__': From 9f0653a5165d278c84774ccc3d8cbe64b59cb4d7 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:20:42 +0200 Subject: [PATCH 09/29] support sparse matrices --- skglm/solvers/fista.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index aaafca104..b058033a5 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -1,7 +1,8 @@ import numpy as np +from scipy.sparse import issparse from numba import njit from skglm.solvers.base import BaseSolver -from skglm.solvers.common import construct_grad +from skglm.solvers.common import construct_grad, construct_grad_sparse @njit @@ -64,7 +65,11 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): t_old = t_new t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 w_old = w.copy() - grad = construct_grad(X, y, z, X @ z, datafit, all_features) + if issparse(X): + grad = construct_grad_sparse( + X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features) + else: + grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz w = _prox_vec(w, z, penalty, lipschitz) Xw = X @ w From fe159beed85fcdfb89364d23fd2bd47a455b5f63 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:20:50 +0200 Subject: [PATCH 10/29] fix mistake --- skglm/datafits/single_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 583312083..df6b860b2 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -231,7 +231,7 @@ def params_to_dict(self): def initialize(self, yXT, y): n_features = yXT.shape[1] self.lipschitz = np.zeros(n_features, dtype=yXT.dtype) - self.global_lipschitz = norm(yXT, ord=2) ** 2 / len(y) + self.global_lipschitz = norm(yXT, ord=2) ** 2 for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 From 
8e74e8a232894fb2f92cef3dc9582b557b05da85 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:32:35 +0200 Subject: [PATCH 11/29] RM toy_fista --- toy_fista.py | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) delete mode 100644 toy_fista.py diff --git a/toy_fista.py b/toy_fista.py deleted file mode 100644 index 1c29bf713..000000000 --- a/toy_fista.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -from numpy.linalg import norm -from skglm.datafits.single_task import Quadratic, Logistic -from skglm.solvers import FISTA -from skglm.penalties import L1 -from skglm.estimators import SparseLogisticRegression, Lasso -from skglm.utils import make_correlated_data, compiled_clone - - -X, y, _ = make_correlated_data(n_samples=200, n_features=100, random_state=24) - -n_samples, n_features = X.shape -alpha_max = norm(X.T @ y, ord=np.inf) / n_samples - -alpha = alpha_max / 10 - -max_iter = 1000 -obj_freq = 100 -tol = 1e-10 - -solver = FISTA(max_iter=max_iter, tol=tol, opt_freq=obj_freq, verbose=1) - -############## -# Quadratic -############## -datafit = compiled_clone(Quadratic()) -datafit.initialize(X, y) -penalty = compiled_clone(L1(alpha)) -w = solver.solve(X, y, datafit, penalty) - -clf = Lasso(alpha=alpha, tol=tol, fit_intercept=False) -clf.fit(X, y) - -np.testing.assert_allclose(w, clf.coef_, rtol=1e-5) - -############## -# Logistic -############## -y = np.sign(y) -alpha_max = norm(X.T @ y, ord=np.inf) / (4 * n_samples) -alpha = alpha_max / 10 - -datafit = compiled_clone(Logistic()) -datafit.initialize(X, y) -penalty = compiled_clone(L1(alpha)) -w = solver.solve(X, y, datafit, penalty) - -clf = SparseLogisticRegression(alpha=alpha, tol=tol, fit_intercept=False) -clf.fit(X, y) - -np.testing.assert_allclose(w, np.squeeze(clf.coef_), rtol=1e-3) From a24ed9c6d5156bf02b4fee850cacd9484d11738a Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 14 Oct 2022 22:37:24 +0200 Subject: [PATCH 12/29] green --- skglm/solvers/fista.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index b058033a5..dc2f60ccb 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -67,7 +67,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): w_old = w.copy() if issparse(X): grad = construct_grad_sparse( - X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features) + X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features) else: grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz From 4362c2c02c3d3c0303eb2b75d306afa253e7dba9 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:10:55 +0200 Subject: [PATCH 13/29] mv `_prox_vec` to utils --- skglm/solvers/fista.py | 12 ++---------- skglm/utils.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index dc2f60ccb..7214d0c2e 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -1,16 +1,8 @@ import numpy as np from scipy.sparse import issparse -from numba import njit from skglm.solvers.base import BaseSolver from skglm.solvers.common import construct_grad, construct_grad_sparse - - -@njit -def _prox_vec(w, z, penalty, lipschitz): - n_features = w.shape[0] - for j in range(n_features): - w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) - return w +from skglm.utils import prox_vec class FISTA(BaseSolver): @@ -71,7 +63,7 @@ def 
solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): else: grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz - w = _prox_vec(w, z, penalty, lipschitz) + w = prox_vec(w, z, penalty, lipschitz) Xw = X @ w z = w + (t_old - 1.) / t_new * (w - w_old) diff --git a/skglm/utils.py b/skglm/utils.py index 55c2fa462..bffa72c77 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -457,3 +457,32 @@ def extrapolate(self, w, Xw): C = inv_UTU_ones / np.sum(inv_UTU_ones) # floating point errors may cause w and Xw to disagree return self.arr_w_[:, 1:] @ C, self.arr_Xw_[:, 1:] @ C, True + + +@njit +def prox_vec(w, z, penalty, lipschitz): + """Evaluate the vectorized proximal operator for the FISTA solver. + + Parameters + ---------- + w : array, shape (n_features,) + Coefficient vector. + + z : array, shape (n_features,) + FISTA auxiliary variable. + + penalty : instance of Penalty. + Penalty object. + + lipschitz : float + Global Lipschitz constant. + + Returns + ------- + w : array; shape (n_features,) + Updated coefficient vector. + """ + n_features = w.shape[0] + for j in range(n_features): + w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) + return w From 2665d5d1853cde35361e0176647b1c2f84f36412 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:11:58 +0200 Subject: [PATCH 14/29] rm `opt_freq` --- skglm/solvers/fista.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 7214d0c2e..4b1e709d8 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -18,9 +18,6 @@ class FISTA(BaseSolver): tol : float, default 1e-4 Tolerance for convergence. - opt_freq : int, default 10 - Frequency for optimality condition check. - verbose : bool, default False Amount of verbosity. 0/False is silent. @@ -32,10 +29,9 @@ class FISTA(BaseSolver): https://epubs.siam.org/doi/10.1137/080716542 """ - def __init__(self, max_iter=100, tol=1e-4, opt_freq=10, verbose=0): + def __init__(self, max_iter=100, tol=1e-4, verbose=0): self.max_iter = max_iter self.tol = tol - self.opt_freq = opt_freq self.verbose = verbose def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): @@ -67,19 +63,18 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): Xw = X @ w z = w + (t_old - 1.) 
/ t_new * (w - w_old) - if n_iter % self.opt_freq == 0: - opt = penalty.subdiff_distance(w, grad, all_features) - stop_crit = np.max(opt) + opt = penalty.subdiff_distance(w, grad, all_features) + stop_crit = np.max(opt) - if self.verbose: - p_obj = datafit.value(y, w, Xw) + penalty.value(w) - print( - f"Iteration {n_iter+1}: {p_obj:.10f}, " - f"stopping crit: {stop_crit:.2e}" - ) + if self.verbose: + p_obj = datafit.value(y, w, Xw) + penalty.value(w) + print( + f"Iteration {n_iter+1}: {p_obj:.10f}, " + f"stopping crit: {stop_crit:.2e}" + ) - if stop_crit < self.tol: - if self.verbose: - print(f"Stopping criterion max violation: {stop_crit:.2e}") - break + if stop_crit < self.tol: + if self.verbose: + print(f"Stopping criterion max violation: {stop_crit:.2e}") + break return w From 2e408bc6100ba993527bbc0bbbb234099dd3c70f Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:22:11 +0200 Subject: [PATCH 15/29] fix tests --- skglm/solvers/fista.py | 6 ++++-- skglm/tests/test_fista.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 4b1e709d8..0732d066f 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -35,6 +35,7 @@ def __init__(self, max_iter=100, tol=1e-4, verbose=0): self.verbose = verbose def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + p_objs_out = [] n_samples, n_features = X.shape all_features = np.arange(n_features) t_new = 1 @@ -66,8 +67,9 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): opt = penalty.subdiff_distance(w, grad, all_features) stop_crit = np.max(opt) + p_obj = datafit.value(y, w, Xw) + penalty.value(w) + p_objs_out.append(p_obj) if self.verbose: - p_obj = datafit.value(y, w, Xw) + penalty.value(w) print( f"Iteration {n_iter+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" @@ -77,4 +79,4 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): if self.verbose: print(f"Stopping criterion max violation: {stop_crit:.2e}") break - return w + return w, np.array(p_objs_out), stop_crit diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 04c1ed516..f490c11a4 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -39,13 +39,13 @@ def test_fista_solver(X, Datafit, Penalty): datafit.initialize(_init, _y) penalty = compiled_clone(Penalty(alpha)) - solver = FISTA(max_iter=1000, tol=tol, opt_freq=1) - w = solver.solve(X, _y, datafit, penalty) + solver = FISTA(max_iter=1000, tol=tol) + res_fista = solver.solve(X, _y, datafit, penalty) solver_cd = AndersonCD(tol=tol, fit_intercept=False) - w_cd = solver_cd.solve(X, _y, datafit, penalty)[0] + res_cd = solver_cd.solve(X, _y, datafit, penalty) - np.testing.assert_allclose(w, w_cd, rtol=1e-3) + np.testing.assert_allclose(res_fista[0], res_cd[0], rtol=1e-3) if __name__ == '__main__': From 8524cf7b0180b061d27e2522bd8ebb09d1fbae5a Mon Sep 17 00:00:00 2001 From: PAB Date: Sun, 16 Oct 2022 14:23:26 +0200 Subject: [PATCH 16/29] Update skglm/solvers/fista.py Co-authored-by: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> --- skglm/solvers/fista.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 0732d066f..1779dd3e3 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -38,7 +38,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): p_objs_out = [] n_samples, n_features = X.shape all_features = 
np.arange(n_features) - t_new = 1 + t_new = 1. w = w_init.copy() if w_init is not None else np.zeros(n_features) z = w_init.copy() if w_init is not None else np.zeros(n_features) From dd658f8954ff8edd7f1a8a818c9510b3beb4f13a Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 14:27:06 +0200 Subject: [PATCH 17/29] huber comment --- skglm/datafits/single_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index df6b860b2..3bf1eee0e 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -290,7 +290,7 @@ class Huber(BaseDatafit): Attributes ---------- delta : float - Shape hyperparameter. + Threshold hyperparameter. lipschitz : array, shape (n_features,) The coordinatewise gradient Lipschitz constants. Equal to From cbc54182d1215e848241780c7fbd21ed836f45d1 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 16 Oct 2022 19:05:19 +0200 Subject: [PATCH 18/29] WIP --- skglm/datafits/single_task.py | 3 +-- skglm/utils.py | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 3bf1eee0e..0fd19dbe5 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -158,11 +158,10 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): Xj = X_data[X_indptr[j]:X_indptr[j+1]] self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4) - self.global_lipschitz += (Xj ** 2).sum() / (len(y) * 4) + self.global_lipschitz = 0. def value(self, y, w, Xw): return np.log(1. + np.exp(- y * Xw)).sum() / len(y) diff --git a/skglm/utils.py b/skglm/utils.py index bffa72c77..23215d244 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -486,3 +486,44 @@ def prox_vec(w, z, penalty, lipschitz): for j in range(n_features): w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) return w + + +@njit +def power_method(X_data, X_indptr, X_indices, n_iter): + """Power method to compute largest eigenvalue of sparse matrix X. + + Parameters + ---------- + X_data : array, shape (n_elements,) + `data` attribute of the sparse CSC matrix X. + + X_indptr : array, shape (n_features + 1,) + `indptr` attribute of the sparse CSC matrix X. + + X_indices : array, shape (n_elements,) + `indices` attribute of the sparse CSC matrix X. + + n_iter : int + Number of iterations for the power method. 
+ + Returns + ------- + rayleigh : float + Rayleigh quotient or spectral radius of X^TX + """ + n_features = len(X_indptr) - 1 + b_k = np.random.rand(n_features) + for _ in range(n_iter): + b_k1 = np.zeros(n_features) + for j in range(n_features): + b_k1[j] = + b_k1 = A @ b_k # write the full loop + b_k1_nrm = norm(b_k1) + b_k = b_k1 / b_k1_nrm + rayleigh = b_k1 @ b_k / (norm(b_k) ** 2) + return rayleigh + + + + + From e76dfb10f8f507ea7ed49ac3e9a3db926719d6d5 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 01:00:57 +0200 Subject: [PATCH 19/29] implement power method --- skglm/tests/test_fista.py | 12 +++- skglm/utils.py | 116 ++++++++++++++++++++++++++++++-------- 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index f490c11a4..617c71848 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -7,7 +7,7 @@ from skglm.datafits import Quadratic, Logistic, QuadraticSVC from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD -from skglm.utils import make_correlated_data, compiled_clone +from skglm.utils import make_correlated_data, compiled_clone, spectral_norm2 np.random.seed(0) @@ -39,7 +39,7 @@ def test_fista_solver(X, Datafit, Penalty): datafit.initialize(_init, _y) penalty = compiled_clone(Penalty(alpha)) - solver = FISTA(max_iter=1000, tol=tol) + solver = FISTA(max_iter=1000, tol=tol) res_fista = solver.solve(X, _y, datafit, penalty) solver_cd = AndersonCD(tol=tol, fit_intercept=False) @@ -48,5 +48,13 @@ def test_fista_solver(X, Datafit, Penalty): np.testing.assert_allclose(res_fista[0], res_cd[0], rtol=1e-3) +def test_spectral_norm2(): + X_bundles = (X_sparse.data, X_sparse.indptr, X_sparse.indices) + lipschitz_our = spectral_norm2(*X_bundles, n_samples=len(y)) + lipschitz_np = norm(X_sparse.toarray(), ord=2) ** 2 + + np.testing.assert_allclose(lipschitz_our, lipschitz_np, atol=1e-4, rtol=1e-5) + + if __name__ == '__main__': pass diff --git a/skglm/utils.py b/skglm/utils.py index 23215d244..8b791f033 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -488,42 +488,114 @@ def prox_vec(w, z, penalty, lipschitz): return w +POWER_MAX_ITER = 20 +TOL = 1e-6 ** 2 + + @njit -def power_method(X_data, X_indptr, X_indices, n_iter): - """Power method to compute largest eigenvalue of sparse matrix X. - +def spectral_norm2(X_data, X_indptr, X_indices, n_samples): + """Compute the squared spectral norm of sparse matrix ``X``. + + Find the largest eigenvalue of ``X @ X.T`` using the power method. + Parameters ---------- X_data : array, shape (n_elements,) - `data` attribute of the sparse CSC matrix X. + `data` attribute of the sparse CSC matrix ``X``. X_indptr : array, shape (n_features + 1,) - `indptr` attribute of the sparse CSC matrix X. + `indptr` attribute of the sparse CSC matrix ``X``. X_indices : array, shape (n_elements,) - `indices` attribute of the sparse CSC matrix X. - - n_iter : int - Number of iterations for the power method. - + `indices` attribute of the sparse CSC matrix ``X``. + + n_samples : int + number of rows of ``X``. + Returns ------- - rayleigh : float - Rayleigh quotient or spectral radius of X^TX + eigenvalue : float + The largest eigenvalue value of ``X.T @ X``, aka the squared spectral of ``X``. + + References + ---------- + .. [1] Alfio Quarteroni, Riccardo Sacco, Fausto Saleri "Numerical Mathematics", + chapiter 5, page 192-195. """ + # init vec with norm(vec) == 1. 
+ eigenvector = np.full(n_samples, 1/np.sqrt(n_samples)) + eigenvalue = 1. + + for _ in range(POWER_MAX_ITER): + vec = _XXT_dot_vec(X_data, X_indptr, X_indices, eigenvector, n_samples) + norm_vec = norm(vec) + eigenvalue = vec @ eigenvector + + # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) <= tol + if norm_vec ** 2 - eigenvalue ** 2 <= TOL: + break + + eigenvector = vec / norm_vec + + return eigenvalue + + +@njit +def _XXT_dot_vec(X_data, X_indptr, X_indices, vec, n_samples): + # computes X @ X.T @ vec, with X csc encoded + return _X_dot_vec(X_data, X_indptr, X_indices, + _XT_dot_vec(X_data, X_indptr, X_indices, vec), + n_samples) + + +@njit +def _X_dot_vec(X_data, X_indptr, X_indices, vec, n_samples): + # compute X @ vec, with X csc encoded + result = np.zeros(n_samples) + + # loop over features + for j in range(len(X_indptr) - 1): + if vec[j] == 0: + continue + + col_j_rows_idx = slice(X_indptr[j], X_indptr[j+1]) + result[X_indices[col_j_rows_idx]] += vec[j] * X_data[col_j_rows_idx] + + return result + + +@njit +def _XT_dot_vec(X_data, X_indptr, X_indices, vec): + # compute X.T @ vec, with X csc encoded n_features = len(X_indptr) - 1 - b_k = np.random.rand(n_features) - for _ in range(n_iter): - b_k1 = np.zeros(n_features) - for j in range(n_features): - b_k1[j] = - b_k1 = A @ b_k # write the full loop - b_k1_nrm = norm(b_k1) - b_k = b_k1 / b_k1_nrm - rayleigh = b_k1 @ b_k / (norm(b_k) ** 2) - return rayleigh + result = np.zeros(n_features) + + for j in range(n_features): + for idx in range(X_indptr[j], X_indptr[j+1]): + result[j] += X_data[idx] * vec[X_indices[idx]] + + return result + +if __name__ == '__main__': + from scipy.sparse import csc_matrix, random + import time + n_samples, n_features = 500, 600 + A = random(n_samples, n_features, density=0.5, format='csc') + X = csc_matrix(A) + X_dense = X.toarray() + # cache numba compilation + M = random(5, 7, density=0.9, format='csc') + spectral_norm2(M.data, M.indptr, M.indices, 5) + start = time.time() + spectral_norm2(X.data, X.indptr, X.indices, n_samples) + end = time.time() + print("our: ", end - start) + start = time.time() + norm(X_dense, ord=2) ** 2 + end = time.time() + print("np: ", end - start) From 2a4bce31274063476b6fe0aa313e39265cd07c3c Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 01:07:07 +0200 Subject: [PATCH 20/29] private ``prox_vec`` --- skglm/solvers/fista.py | 4 ++-- skglm/utils.py | 25 +++---------------------- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index 1779dd3e3..417af1a7e 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -2,7 +2,7 @@ from scipy.sparse import issparse from skglm.solvers.base import BaseSolver from skglm.solvers.common import construct_grad, construct_grad_sparse -from skglm.utils import prox_vec +from skglm.utils import _prox_vec class FISTA(BaseSolver): @@ -60,7 +60,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): else: grad = construct_grad(X, y, z, X @ z, datafit, all_features) z -= grad / lipschitz - w = prox_vec(w, z, penalty, lipschitz) + w = _prox_vec(w, z, penalty, lipschitz) Xw = X @ w z = w + (t_old - 1.) / t_new * (w - w_old) diff --git a/skglm/utils.py b/skglm/utils.py index 8b791f033..230719a0c 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -460,28 +460,9 @@ def extrapolate(self, w, Xw): @njit -def prox_vec(w, z, penalty, lipschitz): - """Evaluate the vectorized proximal operator for the FISTA solver. 
- - Parameters - ---------- - w : array, shape (n_features,) - Coefficient vector. - - z : array, shape (n_features,) - FISTA auxiliary variable. - - penalty : instance of Penalty. - Penalty object. - - lipschitz : float - Global Lipschitz constant. - - Returns - ------- - w : array; shape (n_features,) - Updated coefficient vector. - """ +def _prox_vec(w, z, penalty, lipschitz): + # evaluate the vectorized proximal operator for the FISTA solver + # lipschitz stands for global lipschitz constant n_features = w.shape[0] for j in range(n_features): w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j) From cd39a62290aa3c62a4d68a5c98d2811125c7d74d Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 10:22:33 +0200 Subject: [PATCH 21/29] random init in power method && default args --- skglm/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/skglm/utils.py b/skglm/utils.py index 230719a0c..75c993c57 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -469,12 +469,9 @@ def _prox_vec(w, z, penalty, lipschitz): return w -POWER_MAX_ITER = 20 -TOL = 1e-6 ** 2 - - @njit -def spectral_norm2(X_data, X_indptr, X_indices, n_samples): +def spectral_norm2(X_data, X_indptr, X_indices, n_samples, + max_iter=20, tol=1e-6): """Compute the squared spectral norm of sparse matrix ``X``. Find the largest eigenvalue of ``X @ X.T`` using the power method. @@ -503,17 +500,22 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples): .. [1] Alfio Quarteroni, Riccardo Sacco, Fausto Saleri "Numerical Mathematics", chapiter 5, page 192-195. """ + # tol is squared as we evaluate the square of inequality (5.25) in ref [1] + tol = tol ** 2 + # init vec with norm(vec) == 1. - eigenvector = np.full(n_samples, 1/np.sqrt(n_samples)) + eigenvector = np.random.randn(n_samples) + eigenvector /= norm(eigenvector) eigenvalue = 1. - for _ in range(POWER_MAX_ITER): + for _ in range(max_iter): vec = _XXT_dot_vec(X_data, X_indptr, X_indices, eigenvector, n_samples) norm_vec = norm(vec) eigenvalue = vec @ eigenvector # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) <= tol - if norm_vec ** 2 - eigenvalue ** 2 <= TOL: + # inequality is squared + if norm_vec ** 2 - eigenvalue ** 2 <= tol: break eigenvector = vec / norm_vec From 0e4d42a49c82341a4a864742d3998c02ae07fc3e Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 11:45:56 +0200 Subject: [PATCH 22/29] use power method for ``global_lipschitz`` --- skglm/datafits/single_task.py | 33 ++++++++++++++++++++------------- skglm/utils.py | 2 +- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 0fd19dbe5..c2bd741ff 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -4,6 +4,7 @@ from numba import float64 from skglm.datafits.base import BaseDatafit +from skglm.utils import spectral_norm2 class Quadratic(BaseDatafit): @@ -54,12 +55,14 @@ def initialize(self, X, y): for j in range(n_features): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) - def initialize_sparse( - self, X_data, X_indptr, X_indices, y): + def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.Xty = np.zeros(n_features, dtype=X_data.dtype) + + self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz /= len(y) + self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. 
xty = 0 @@ -69,7 +72,6 @@ def initialize_sparse( self.lipschitz[j] = nrm2 / len(y) self.Xty[j] = xty - self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): return np.sum((y - Xw) ** 2) / (2 * len(Xw)) @@ -157,11 +159,14 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 + + self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz /= (len(y) * 4) + self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) for j in range(n_features): Xj = X_data[X_indptr[j]:X_indptr[j+1]] self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4) - self.global_lipschitz = 0. def value(self, y, w, Xw): return np.log(1. + np.exp(- y * Xw)).sum() / len(y) @@ -234,17 +239,18 @@ def initialize(self, yXT, y): for j in range(n_features): self.lipschitz[j] = norm(yXT[:, j]) ** 2 - def initialize_sparse( - self, yXT_data, yXT_indptr, yXT_indices, y): + def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y): n_features = len(yXT_indptr) - 1 + + self.global_lipschitz = spectral_norm2( + yXT_data, yXT_indptr, yXT_indices, len(y)) + self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. for idx in range(yXT_indptr[j], yXT_indptr[j + 1]): nrm2 += yXT_data[idx] ** 2 self.lipschitz[j] = nrm2 - self.global_lipschitz += nrm2 def value(self, y, w, yXTw): return (yXTw ** 2).sum() / 2 - np.sum(w) @@ -327,17 +333,18 @@ def initialize(self, X, y): self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y) self.global_lipschitz += (X[:, j] ** 2).sum() / len(y) - def initialize_sparse( - self, X_data, X_indptr, X_indices, y): + def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 + + self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz /= len(y) + self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = 0. for j in range(n_features): nrm2 = 0. for idx in range(X_indptr[j], X_indptr[j + 1]): nrm2 += X_data[idx] ** 2 self.lipschitz[j] = nrm2 / len(y) - self.global_lipschitz += nrm2 / len(y) def value(self, y, w, Xw): n_samples = len(y) diff --git a/skglm/utils.py b/skglm/utils.py index 75c993c57..4f83c8788 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -493,7 +493,7 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples, Returns ------- eigenvalue : float - The largest eigenvalue value of ``X.T @ X``, aka the squared spectral of ``X``. + The largest eigenvalue of ``X.T @ X``, aka the squared spectral norm of ``X``. 
References ---------- From 2bbc8f5394bccae0ce896d2a3dd8870409a337a1 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 11:46:13 +0200 Subject: [PATCH 23/29] fix && refactor unittest --- skglm/tests/test_fista.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 617c71848..528fe690a 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -2,23 +2,27 @@ import numpy as np from numpy.linalg import norm + +import scipy.sparse from scipy.sparse import csc_matrix, issparse -from skglm.datafits import Quadratic, Logistic, QuadraticSVC from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD +from skglm.datafits import Quadratic, Logistic, QuadraticSVC from skglm.utils import make_correlated_data, compiled_clone, spectral_norm2 -np.random.seed(0) +random_state = 113 n_samples, n_features = 50, 60 -X, y, _ = make_correlated_data( - n_samples=n_samples, n_features=n_features, random_state=0) -X_sparse = csc_matrix(X * np.random.binomial(1, 0.1, X.shape)) + +rng = np.random.RandomState(random_state) +X, y, _ = make_correlated_data(n_samples, n_features, random_state=rng) +rng.seed(random_state) +X_sparse = csc_matrix(X * np.random.binomial(1, 0.5, X.shape)) y_classif = np.sign(y) alpha_max = norm(X.T @ y, ord=np.inf) / len(y) -alpha = alpha_max / 100 +alpha = alpha_max / 10 tol = 1e-10 @@ -40,20 +44,23 @@ def test_fista_solver(X, Datafit, Penalty): penalty = compiled_clone(Penalty(alpha)) solver = FISTA(max_iter=1000, tol=tol) - res_fista = solver.solve(X, _y, datafit, penalty) + w_fista = solver.solve(X, _y, datafit, penalty)[0] solver_cd = AndersonCD(tol=tol, fit_intercept=False) - res_cd = solver_cd.solve(X, _y, datafit, penalty) + w_cd = solver_cd.solve(X, _y, datafit, penalty)[0] - np.testing.assert_allclose(res_fista[0], res_cd[0], rtol=1e-3) + np.testing.assert_allclose(w_fista, w_cd, atol=1e-7) def test_spectral_norm2(): - X_bundles = (X_sparse.data, X_sparse.indptr, X_sparse.indices) - lipschitz_our = spectral_norm2(*X_bundles, n_samples=len(y)) - lipschitz_np = norm(X_sparse.toarray(), ord=2) ** 2 + n_samples, n_features = 50, 60 + A_sparse = scipy.sparse.random(n_samples, n_features, density=0.5, format='csc') + + A_bundles = (A_sparse.data, A_sparse.indptr, A_sparse.indices) + lipschitz_our = spectral_norm2(*A_bundles, n_samples=len(y)) + lipschitz_np = norm(A_sparse.toarray(), ord=2) ** 2 - np.testing.assert_allclose(lipschitz_our, lipschitz_np, atol=1e-4, rtol=1e-5) + np.testing.assert_allclose(lipschitz_our, lipschitz_np) if __name__ == '__main__': From ed3686a9dde61dff7ff62aa6733c5f37e7f432b1 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 21 Oct 2022 11:50:20 +0200 Subject: [PATCH 24/29] add docs for tol and max_iter && clean ups --- skglm/utils.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/skglm/utils.py b/skglm/utils.py index 4f83c8788..4e563bad0 100644 --- a/skglm/utils.py +++ b/skglm/utils.py @@ -490,6 +490,12 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples, n_samples : int number of rows of ``X``. + max_iter : int, default 20 + Maximum number of power method iterations. + + tol : float, default 1e-6 + Tolerance for convergence. 
+ Returns ------- eigenvalue : float @@ -561,24 +567,4 @@ def _XT_dot_vec(X_data, X_indptr, X_indices, vec): if __name__ == '__main__': - from scipy.sparse import csc_matrix, random - import time - n_samples, n_features = 500, 600 - A = random(n_samples, n_features, density=0.5, format='csc') - - X = csc_matrix(A) - X_dense = X.toarray() - - # cache numba compilation - M = random(5, 7, density=0.9, format='csc') - spectral_norm2(M.data, M.indptr, M.indices, 5) - - start = time.time() - spectral_norm2(X.data, X.indptr, X.indices, n_samples) - end = time.time() - print("our: ", end - start) - - start = time.time() - norm(X_dense, ord=2) ** 2 - end = time.time() - print("np: ", end - start) + pass From aa15c46c26d9c233a35d4319d5449d39b9842867 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 21 Oct 2022 13:53:51 +0200 Subject: [PATCH 25/29] remove square form spectral norm --- skglm/datafits/single_task.py | 12 +++++------ skglm/tests/test_fista.py | 14 +++++++------ skglm/utils.py | 38 +++++++++++++++++++++++++++-------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index c2bd741ff..63047f6fe 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -4,7 +4,7 @@ from numba import float64 from skglm.datafits.base import BaseDatafit -from skglm.utils import spectral_norm2 +from skglm.utils import spectral_norm class Quadratic(BaseDatafit): @@ -59,7 +59,7 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 self.Xty = np.zeros(n_features, dtype=X_data.dtype) - self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 self.global_lipschitz /= len(y) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) @@ -160,7 +160,7 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 - self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 self.global_lipschitz /= (len(y) * 4) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) @@ -242,8 +242,8 @@ def initialize(self, yXT, y): def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y): n_features = len(yXT_indptr) - 1 - self.global_lipschitz = spectral_norm2( - yXT_data, yXT_indptr, yXT_indices, len(y)) + self.global_lipschitz = spectral_norm( + yXT_data, yXT_indptr, yXT_indices, len(y)) ** 2 self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype) for j in range(n_features): @@ -336,7 +336,7 @@ def initialize(self, X, y): def initialize_sparse(self, X_data, X_indptr, X_indices, y): n_features = len(X_indptr) - 1 - self.global_lipschitz = spectral_norm2(X_data, X_indptr, X_indices, len(y)) + self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2 self.global_lipschitz /= len(y) self.lipschitz = np.zeros(n_features, dtype=X_data.dtype) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 528fe690a..559c5d9ca 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -4,12 +4,13 @@ from numpy.linalg import norm import scipy.sparse +import scipy.sparse.linalg from scipy.sparse import csc_matrix, issparse from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD from skglm.datafits import Quadratic, Logistic, QuadraticSVC -from skglm.utils 
+from skglm.utils import make_correlated_data, compiled_clone, spectral_norm
 
 
 random_state = 113
@@ -52,15 +53,16 @@ def test_fista_solver(X, Datafit, Penalty):
     np.testing.assert_allclose(w_fista, w_cd, atol=1e-7)
 
 
-def test_spectral_norm2():
+def test_spectral_norm():
     n_samples, n_features = 50, 60
-    A_sparse = scipy.sparse.random(n_samples, n_features, density=0.5, format='csc')
+    A_sparse = scipy.sparse.random(n_samples, n_features, density=0.7, format='csc',
+                                   random_state=random_state)
     A_bundles = (A_sparse.data, A_sparse.indptr, A_sparse.indices)
 
-    lipschitz_our = spectral_norm2(*A_bundles, n_samples=len(y))
-    lipschitz_np = norm(A_sparse.toarray(), ord=2) ** 2
+    spectral_norm_our = spectral_norm(*A_bundles, n_samples=len(y))
+    spectral_norm_sp = scipy.sparse.linalg.svds(A_sparse, k=1)[1]
 
-    np.testing.assert_allclose(lipschitz_our, lipschitz_np)
+    np.testing.assert_allclose(spectral_norm_our, spectral_norm_sp)
 
 
 if __name__ == '__main__':
diff --git a/skglm/utils.py b/skglm/utils.py
index 4e563bad0..c66185ef4 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -470,22 +470,22 @@ def _prox_vec(w, z, penalty, lipschitz):
 
 
 @njit
-def spectral_norm2(X_data, X_indptr, X_indices, n_samples,
-                   max_iter=20, tol=1e-6):
-    """Compute the squared spectral norm of sparse matrix ``X``.
+def spectral_norm(X_data, X_indptr, X_indices, n_samples,
+                  max_iter=20, tol=1e-6):
+    """Compute the spectral norm of sparse matrix ``X``.
 
     Find the largest eigenvalue of ``X @ X.T`` using the power method.
 
     Parameters
     ----------
     X_data : array, shape (n_elements,)
-        `data` attribute of the sparse CSC matrix ``X``.
+        ``data`` attribute of the sparse CSC matrix ``X``.
 
     X_indptr : array, shape (n_features + 1,)
-        `indptr` attribute of the sparse CSC matrix ``X``.
+        ``indptr`` attribute of the sparse CSC matrix ``X``.
 
     X_indices : array, shape (n_elements,)
-        `indices` attribute of the sparse CSC matrix ``X``.
+        ``indices`` attribute of the sparse CSC matrix ``X``.
 
     n_samples : int
         number of rows of ``X``.
@@ -499,7 +499,7 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples,
     Returns
     -------
     eigenvalue : float
-        The largest eigenvalue of ``X.T @ X``, aka the squared spectral norm of ``X``.
+        The largest singular value of ``X``.
     References
     ----------
@@ -526,7 +526,7 @@ def spectral_norm2(X_data, X_indptr, X_indices, n_samples,
 
     eigenvector = vec / norm_vec
 
-    return eigenvalue
+    return np.sqrt(eigenvalue)
 
 
 @njit
@@ -567,4 +567,26 @@ def _XT_dot_vec(X_data, X_indptr, X_indices, vec):
 
 
 if __name__ == '__main__':
+    import time
+    import scipy.sparse.linalg
+    from scipy.sparse import csc_matrix, random
+    n_samples, n_features = 500, 600
+    A = random(n_samples, n_features, density=0.5, format='csc')
+
+    X = csc_matrix(A)
+    X_dense = X.toarray()
+
+    # cache numba compilation
+    M = random(5, 7, density=0.9, format='csc')
+    spectral_norm(M.data, M.indptr, M.indices, 5)
+
+    start = time.time()
+    spectral_norm(X.data, X.indptr, X.indices, n_samples)
+    end = time.time()
+    print("our: ", end - start)
+
+    start = time.time()
+    scipy.sparse.linalg.svds(X, k=1)[1]
+    end = time.time()
+    print("scipy: ", end - start)
     pass

From 27b918dca8c66bf4405645661e9191dee4f9e2f2 Mon Sep 17 00:00:00 2001
From: Badr-MOUFAD
Date: Fri, 21 Oct 2022 14:11:32 +0200
Subject: [PATCH 26/29] refactor ``_prox_vec`` function

---
 skglm/solvers/fista.py |  8 ++++----
 skglm/utils.py         | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py
index 417af1a7e..bbdc451b7 100644
--- a/skglm/solvers/fista.py
+++ b/skglm/solvers/fista.py
@@ -8,8 +8,6 @@ class FISTA(BaseSolver):
     r"""ISTA solver with Nesterov acceleration (FISTA).
 
-    This solver implements accelerated proximal gradient descent for linear problems.
-
     Attributes
     ----------
     max_iter : int, default 100
@@ -59,8 +57,10 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
                 X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features)
         else:
             grad = construct_grad(X, y, z, X @ z, datafit, all_features)
-        z -= grad / lipschitz
-        w = _prox_vec(w, z, penalty, lipschitz)
+
+        step = 1 / lipschitz
+        z -= step * grad
+        w = _prox_vec(w, z, penalty, step)
         Xw = X @ w
         z = w + (t_old - 1.) / t_new * (w - w_old)
 
diff --git a/skglm/utils.py b/skglm/utils.py
index c66185ef4..b6ba50a50 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -460,12 +460,12 @@ def extrapolate(self, w, Xw):
 
 
 @njit
-def _prox_vec(w, z, penalty, lipschitz):
-    # evaluate the vectorized proximal operator for the FISTA solver
-    # lipschitz stands for global lipschitz constant
+def _prox_vec(w, z, penalty, step):
+    # evaluate the vectorized proximal operator
+    # step is 1 / lipschitz, with lipschitz the global Lipschitz constant
     n_features = w.shape[0]
     for j in range(n_features):
-        w[j] = penalty.prox_1d(z[j], 1 / lipschitz, j)
+        w[j] = penalty.prox_1d(z[j], step, j)
     return w
 
 
 @njit
 def spectral_norm(X_data, X_indptr, X_indices, n_samples,
                   max_iter=20, tol=1e-6):
     """Compute the spectral norm of sparse matrix ``X``.
 
-    Find the largest eigenvalue of ``X @ X.T`` using the power method.
+    Apply the power method to ``X @ X.T`` to find its largest eigenvalue.
 
     Parameters
     ----------
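
Note on patch 26: with ``step = 1 / lipschitz``, the loop body of ``solve`` is the
textbook FISTA iteration. A minimal, self-contained NumPy sketch of the same update
for the Lasso case, where the prox reduces to soft-thresholding (``fista_lasso`` and
``soft_threshold`` are illustrative names, not part of skglm):

    import numpy as np

    def soft_threshold(x, tau):
        # prox of tau * ||.||_1, applied coordinate-wise
        return np.sign(x) * np.maximum(np.abs(x) - tau, 0.)

    def fista_lasso(X, y, alpha, max_iter=1000):
        # minimize ||y - X @ w||^2 / (2 * n_samples) + alpha * ||w||_1
        n_samples, n_features = X.shape
        lipschitz = np.linalg.norm(X, ord=2) ** 2 / n_samples
        step = 1 / lipschitz
        w = np.zeros(n_features)
        z = np.zeros(n_features)
        t_new = 1.
        for _ in range(max_iter):
            t_old = t_new
            t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2
            w_old = w.copy()
            # gradient of the quadratic datafit, evaluated at z
            grad = X.T @ (X @ z - y) / n_samples
            # prox step with step size 1 / lipschitz
            w = soft_threshold(z - step * grad, step * alpha)
            # Nesterov extrapolation
            z = w + (t_old - 1.) / t_new * (w - w_old)
        return w

Passing ``step`` to ``_prox_vec`` rather than ``lipschitz`` computes ``1 / lipschitz``
once per iteration instead of once per coordinate, and keeps the prox helper agnostic
to how the step size was obtained.
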
From 9d8e3c0d1ab92f9cd631b02241a7cd832c0b771e Mon Sep 17 00:00:00 2001
From: Badr-MOUFAD
Date: Fri, 21 Oct 2022 17:48:17 +0200
Subject: [PATCH 27/29] fix segmentation fault bug

---
 skglm/datafits/single_task.py  |  2 +-
 skglm/tests/test_estimators.py |  8 ++------
 skglm/utils.py                 | 25 +------------------------
 3 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py
index 63047f6fe..09b19beb1 100644
--- a/skglm/datafits/single_task.py
+++ b/skglm/datafits/single_task.py
@@ -243,7 +243,7 @@ def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y):
         n_features = len(yXT_indptr) - 1
 
         self.global_lipschitz = spectral_norm(
-            yXT_data, yXT_indptr, yXT_indices, len(y)) ** 2
+            yXT_data, yXT_indptr, yXT_indices, max(yXT_indices)+1) ** 2
 
         self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype)
         for j in range(n_features):
diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py
index 348c5ef67..9e1e1cd71 100644
--- a/skglm/tests/test_estimators.py
+++ b/skglm/tests/test_estimators.py
@@ -25,12 +25,8 @@
 from skglm.solvers import AndersonCD
 
 
-n_samples = 50
-n_tasks = 9
-n_features = 60
-X, Y, _ = make_correlated_data(
-    n_samples=n_samples, n_features=n_features, n_tasks=n_tasks, density=0.1,
-    random_state=0)
+n_samples, n_features, n_tasks = 50, 60, 9
+X, Y, _ = make_correlated_data(n_samples, n_features, n_tasks, random_state=0)
 y = Y[:, 0]
 
 np.random.seed(0)
diff --git a/skglm/utils.py b/skglm/utils.py
index b6ba50a50..ed982fade 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -533,8 +533,7 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples,
 def _XXT_dot_vec(X_data, X_indptr, X_indices, vec, n_samples):
     # computes X @ X.T @ vec, with X csc encoded
     return _X_dot_vec(X_data, X_indptr, X_indices,
-                      _XT_dot_vec(X_data, X_indptr, X_indices, vec),
-                      n_samples)
+                      _XT_dot_vec(X_data, X_indptr, X_indices, vec), n_samples)
 
 
 @njit
@@ -567,26 +566,4 @@ def _XT_dot_vec(X_data, X_indptr, X_indices, vec):
 
 
 if __name__ == '__main__':
-    import time
-    import scipy.sparse.linalg
-    from scipy.sparse import csc_matrix, random
-    n_samples, n_features = 500, 600
-    A = random(n_samples, n_features, density=0.5, format='csc')
-
-    X = csc_matrix(A)
-    X_dense = X.toarray()
-
-    # cache numba compilation
-    M = random(5, 7, density=0.9, format='csc')
-    spectral_norm(M.data, M.indptr, M.indices, 5)
-
-    start = time.time()
-    spectral_norm(X.data, X.indptr, X.indices, n_samples)
-    end = time.time()
-    print("our: ", end - start)
-
-    start = time.time()
-    scipy.sparse.linalg.svds(X, k=1)[1]
-    end = time.time()
-    print("scipy: ", end - start)
     pass

From e5ce21bf3704ab5d38d26b323758d888b10ad618 Mon Sep 17 00:00:00 2001
From: Badr-MOUFAD
Date: Fri, 21 Oct 2022 18:01:42 +0200
Subject: [PATCH 28/29] add FISTA to docs && fix unittest

---
 doc/api.rst                    | 1 +
 skglm/tests/test_estimators.py | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index 4d003891e..d05ed8a57 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -70,6 +70,7 @@ Solvers
    :toctree: generated/
 
    AndersonCD
+   FISTA
    GramCD
    GroupBCD
    MultiTaskBCD

diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py
index 9e1e1cd71..348c5ef67 100644
--- a/skglm/tests/test_estimators.py
+++ b/skglm/tests/test_estimators.py
@@ -25,8 +25,12 @@
 from skglm.solvers import AndersonCD
 
 
-n_samples, n_features, n_tasks = 50, 60, 9
-X, Y, _ = make_correlated_data(n_samples, n_features, n_tasks, random_state=0)
+n_samples = 50
+n_tasks = 9
+n_features = 60
+X, Y, _ = make_correlated_data(
+    n_samples=n_samples, n_features=n_features, n_tasks=n_tasks, density=0.1,
+    random_state=0)
 y = Y[:, 0]
 
 np.random.seed(0)
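
Note on patch 27: ``spectral_norm`` allocates and indexes arrays of length
``n_samples`` inside numba-compiled code, which does no bounds checking by default,
so passing a row count smaller than the true number of rows of the CSC matrix (here,
``len(y)`` instead of the row count of the transposed matrix ``yXT``) writes out of
bounds -- the likely cause of the segmentation fault fixed above. ``max(yXT_indices)
+ 1`` recovers the row count from the CSC buffers themselves. A small illustration of
the idea (the matrix ``A`` is hypothetical, not taken from the skglm test suite):

    import scipy.sparse

    A = scipy.sparse.random(30, 50, density=0.3, format='csc', random_state=0)

    # a CSC matrix stores only data/indptr/indices, so its row count is not
    # in the buffers; max(indices) + 1 recovers it, up to trailing all-zero
    # rows, which it undercounts -- harmless for the power method, since
    # all-zero rows do not contribute to the spectral norm
    n_rows = max(A.indices) + 1
    assert n_rows <= A.shape[0]
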
From 5d2dbaf2504347701bf070f77abbe944765c705b Mon Sep 17 00:00:00 2001
From: mathurinm
Date: Sat, 22 Oct 2022 16:23:58 +0200
Subject: [PATCH 29/29] cosmetic changes

---
 skglm/datafits/single_task.py |  2 +-
 skglm/utils.py                | 18 ++++++------------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py
index 09b19beb1..948cd1454 100644
--- a/skglm/datafits/single_task.py
+++ b/skglm/datafits/single_task.py
@@ -161,7 +161,7 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y):
         n_features = len(X_indptr) - 1
 
         self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2
-        self.global_lipschitz /= (len(y) * 4)
+        self.global_lipschitz /= 4 * len(y)
 
         self.lipschitz = np.zeros(n_features, dtype=X_data.dtype)
         for j in range(n_features):
diff --git a/skglm/utils.py b/skglm/utils.py
index ed982fade..2932c1822 100644
--- a/skglm/utils.py
+++ b/skglm/utils.py
@@ -461,8 +461,7 @@ def extrapolate(self, w, Xw):
 
 @njit
 def _prox_vec(w, z, penalty, step):
-    # evaluate the vectorized proximal operator
-    # step is 1 / lipschitz, with lipschitz the global Lipschitz constant
+    # evaluate the full proximal operator
    n_features = w.shape[0]
    for j in range(n_features):
        w[j] = penalty.prox_1d(z[j], step, j)
@@ -471,10 +470,8 @@ def _prox_vec(w, z, penalty, step):
 
 @njit
 def spectral_norm(X_data, X_indptr, X_indices, n_samples,
-                  max_iter=20, tol=1e-6):
-    """Compute the spectral norm of sparse matrix ``X``.
-
-    Apply the power method to ``X @ X.T`` to find its largest eigenvalue.
+                  max_iter=100, tol=1e-6):
+    """Compute the spectral norm of sparse matrix ``X`` with power method.
 
     Parameters
     ----------
@@ -504,11 +501,8 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples,
     References
     ----------
     .. [1] Alfio Quarteroni, Riccardo Sacco, Fausto Saleri "Numerical Mathematics",
-        chapiter 5, page 192-195.
+        chapter 5, page 192-195.
     """
-    # tol is squared as we evaluate the square of inequality (5.25) in ref [1]
-    tol = tol ** 2
-
     # init vec with norm(vec) == 1.
     eigenvector = np.random.randn(n_samples)
     eigenvector /= norm(eigenvector)
@@ -520,8 +514,8 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples,
 
     eigenvalue = vec @ eigenvector
 
     # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) <= tol
-    # inequality is squared
-    if norm_vec ** 2 - eigenvalue ** 2 <= tol:
+    # inequality (5.25) in ref [1] is squared
+    if norm_vec ** 2 - eigenvalue ** 2 <= tol ** 2:
         break
 
     eigenvector = vec / norm_vec
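
For readers who want to check the stopping criterion of patch 29 outside numba, here
is a dense NumPy sketch of the same power method (illustrative only; skglm's version
runs on the CSC buffers, and ``power_method_spectral_norm`` is not a skglm function):

    import numpy as np
    from numpy.linalg import norm

    def power_method_spectral_norm(X, max_iter=100, tol=1e-6):
        # power iteration on X @ X.T: a dense analogue of spectral_norm
        eigenvector = np.random.default_rng(0).standard_normal(X.shape[0])
        eigenvector /= norm(eigenvector)

        for _ in range(max_iter):
            vec = X @ (X.T @ eigenvector)
            norm_vec = norm(vec)
            eigenvalue = vec @ eigenvector

            # with a unit-norm eigenvector, the squared residual
            # norm(X @ X.T @ eigenvector - eigenvalue * eigenvector) ** 2
            # expands to norm_vec ** 2 - eigenvalue ** 2, which is why the
            # criterion is compared against tol ** 2
            if norm_vec ** 2 - eigenvalue ** 2 <= tol ** 2:
                break
            eigenvector = vec / norm_vec

        return np.sqrt(eigenvalue)

    X = np.random.default_rng(0).standard_normal((50, 60))
    print(power_method_spectral_norm(X), norm(X, ord=2))  # the two agree closely
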