From e277f4100c5488d428b53803f1492a5a8733739f Mon Sep 17 00:00:00 2001 From: mattplo Date: Wed, 7 Jul 2021 12:41:01 +0200 Subject: [PATCH 1/9] LMO implementation & basic tests --- Project.toml | 1 + src/afw.jl | 12 ++++---- src/blended_cg.jl | 8 +++--- src/fw_algorithms.jl | 12 ++++---- src/oracles.jl | 66 +++++++++++++++++++++++++++++++++++++++++--- test/lmo.jl | 46 ++++++++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index 07dd95d94..919235c1a 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.1.4" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6" Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" diff --git a/src/afw.jl b/src/afw.jl index 904a65584..6d98208d0 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -89,7 +89,7 @@ function away_frank_wolfe( x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) phi_value = fast_dot(x, gradient) - fast_dot(v, gradient) while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -216,7 +216,7 @@ function away_frank_wolfe( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tt = last @@ -238,7 +238,7 @@ function away_frank_wolfe( active_set_cleanup!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -289,7 +289,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) index = a_loc #Resort to calling the LMO else - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) # Real dual gap promises enough progress. 
grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex @@ -321,7 +321,7 @@ function afw_step(x, gradient, lmo, active_set) local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) away_gap = fast_dot(a, gradient) - fast_dot(x, gradient) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x dual_gap = grad_dot_x - fast_dot(v, gradient) @@ -346,7 +346,7 @@ function afw_step(x, gradient, lmo, active_set) end function fw_step(x, gradient, lmo) - vertex = compute_extreme_point(lmo, gradient) + vertex = compute_extreme_point(lmo, gradient, x=x) return ( x - vertex, vertex, diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 3b0ca1769..60d34a628 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -52,7 +52,7 @@ function blended_conditional_gradient( primal = f(x) grad!(gradient, x) # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient) + vmax = compute_extreme_point(lmo, gradient, x=x) phi = fast_dot(gradient, x0 - vmax) / 2 dual_gap = phi traj_data = [] @@ -247,7 +247,7 @@ function blended_conditional_gradient( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tot_time = (time_ns() - time_start) / 1e9 @@ -271,7 +271,7 @@ function blended_conditional_gradient( active_set_renormalize!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) primal = f(x) #dual_gap = 2phi dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -1029,7 +1029,7 @@ function lp_separation_oracle( end end # otherwise, call the LMO - y = compute_extreme_point(lmo, direction; kwargs...) + y = compute_extreme_point(lmo, direction; x=x, kwargs...) # don't return nothing but y, fast_dot(direction, y) / use y for step outside / and update phi as in LCG (lines 402 - 406) return (y, fast_dot(direction, y)) end diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index d8feb368a..ffc02c098 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -132,7 +132,7 @@ function frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) # go easy on the memory - only compute if really needed if ( (mod(t, print_iter) == 0 && verbose) || @@ -201,7 +201,7 @@ function frank_wolfe( # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -347,7 +347,7 @@ function lazified_conditional_gradient( primal = f(x) end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) + v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy,x=x) tt = lazy if fast_dot(v, gradient) > threshold tt = dualstep @@ -413,7 +413,7 @@ function lazified_conditional_gradient( # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting # hence the final computation. 
grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -559,7 +559,7 @@ function stochastic_frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) # go easy on the memory - only compute if really needed if (mod(t, print_iter) == 0 && verbose) || @@ -622,7 +622,7 @@ function stochastic_frank_wolfe( # last computation done with full evaluation for exact gradient (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) # @show (gradient, primal) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose diff --git a/src/oracles.jl b/src/oracles.jl index bdcb387ca..c3936d8f6 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -158,7 +158,7 @@ function compute_extreme_point( end end end - if best_idx > 0 # && fast_dot(best_v, direction) ≤ threshold + if best_idx > 0 # && fast_dot(best_v, direction) ≤ threshold # println("cache sol") return best_v end @@ -241,10 +241,10 @@ function compute_extreme_point( if best_idx < 0 v = compute_extreme_point(lmo.inner, direction) if store_cache - # note: we do not check for duplicates. hence you might end up with more vertices, + # note: we do not check for duplicates. hence you might end up with more vertices, # in fact up to number of dual steps many, that might be already in the cache - # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it) - # and it is the best possible, hence we will perform a dual step on the outside. + # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it) + # and it is the best possible, hence we will perform a dual step on the outside. # # note: another possibility could be to test against that in the if statement but then you might end you recalculating the same vertex a few times. # as such this might be a better tradeoff, i.e., to not check the set for duplicates and potentially accept #dualSteps many duplicates. @@ -303,3 +303,61 @@ function compute_extreme_point( end return storage end + +""" +""" +struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle + inner::LMO + max_rounds_number::Int + improv_tol::T +end + +""" +""" +function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) + d = 0 * direction + Λ = 0 + #@debug x + #@debug direction + if norm(direction) == 0 + return compute_extreme_point(lmo.inner, direction, x=x, kwargs...) + end + flag = true + function align(d1, d2) + if norm(d1) == 0 || norm(d2) == 0 + return -1 + else + return fast_dot(d1, d2) / (norm(d1) * norm(d2)) + end + end + residual = -direction - d + for round in 1:lmo.max_rounds_number + @debug round + #@debug residual + v = compute_extreme_point(lmo.inner, -residual; x=x, kwargs...) 
+ u = v - x + #@debug v + #@debug v - x + d_norm = norm(d) + if d_norm > 0 && fast_dot(-residual, -d / d_norm) < fast_dot(-residual, u) + u = -d / d_norm + flag = false + end + λ = fast_dot(residual, u) / (norm(u)^2) + d_updated = d + λ * u + if align(-direction, d_updated) - align(-direction, d) >= lmo.improv_tol + d = d_updated + residual = -direction - d + if flag + Λ += λ + else + Λ = Λ * (1 - λ / d_norm) + end + flag = true + else + break + end + end + #@debug x + d / Λ + return x + d / Λ +end diff --git a/test/lmo.jl b/test/lmo.jl index 4cbd92a14..98af39504 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -532,3 +532,49 @@ end vvec = FrankWolfe.compute_extreme_point(lmo, [dinf; d1]; direction_indices=(1:10, 11:15)) @test vvec ≈ [vinf; v1] end + +@testset "ChasingGradient LMO" begin + max_rounds = 100 + improv_tol = 10e-3 + f(x) = norm(x)^2 + function grad!(storage, x) + return storage .= 2x + end + lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1) + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) + x0 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(5)) + @show x0 + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo, + x0, + max_iteration=10, + line_search=FrankWolfe.Agnostic(), + verbose=true, + ) + @show res +end + +@testset "ChasingGradient LMO 2" begin + max_rounds = 100 + improv_tol = 10e-3 + f(x) = norm(x)^2 + function grad!(storage, x) + return storage .= 2x + end + lmo_norm = FrankWolfe.LpNormLMO{1}(1) + lmo = FrankWolfe.ChasingGradientLMO(lmo_norm, max_rounds, improv_tol) + x0 = ones(5) + @show x0 + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo, + x0, + max_iteration=10, + line_search=FrankWolfe.Agnostic(), + verbose=true, + ) + @show res +end From c56a2b417c46bf7b61911f4bbbc031fa1ee02fe4 Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 8 Jul 2021 11:04:26 +0200 Subject: [PATCH 2/9] fixes, add kwargs --- Project.toml | 1 - src/afw.jl | 21 +++++++++--------- src/blended_cg.jl | 6 ++--- src/fw_algorithms.jl | 22 ++++++++++++++----- src/oracles.jl | 37 ++++++++++++++++++------------- test/lmo.jl | 52 ++++++++++++++++++-------------------------- 6 files changed, 73 insertions(+), 66 deletions(-) diff --git a/Project.toml b/Project.toml index 919235c1a..07dd95d94 100644 --- a/Project.toml +++ b/Project.toml @@ -7,7 +7,6 @@ version = "0.1.4" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" -GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6" Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" diff --git a/src/afw.jl b/src/afw.jl index 6d98208d0..47435686b 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -32,6 +32,7 @@ function away_frank_wolfe( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -89,7 +90,7 @@ function away_frank_wolfe( x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) phi_value = fast_dot(x, gradient) - fast_dot(v, gradient) while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -216,7 +217,7 @@ function away_frank_wolfe( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) 
primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tt = last @@ -238,7 +239,7 @@ function away_frank_wolfe( active_set_cleanup!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -263,7 +264,7 @@ end function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) - #Do lazy FW step + # Do lazy FW step grad_dot_lazy_fw_vertex = fast_dot(v, gradient) grad_dot_x = fast_dot(x, gradient) grad_dot_a = fast_dot(a, gradient) @@ -277,7 +278,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) fw_step_taken = true index = v_loc else - #Do away step, as it promises enough progress. + # Do away step, as it promises enough progress. if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex && grad_dot_a - grad_dot_x >= phi / K tt = away @@ -287,9 +288,9 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) away_step_taken = true fw_step_taken = false index = a_loc - #Resort to calling the LMO + # Resort to calling the LMO else - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex @@ -301,7 +302,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) away_step_taken = false fw_step_taken = true index = nothing - #Lower our expectation for progress. + # Lower our expectation for progress. else tt = dualstep phi = min(dual_gap, phi / 2.0) @@ -321,7 +322,7 @@ function afw_step(x, gradient, lmo, active_set) local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) away_gap = fast_dot(a, gradient) - fast_dot(x, gradient) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x dual_gap = grad_dot_x - fast_dot(v, gradient) @@ -346,7 +347,7 @@ function afw_step(x, gradient, lmo, active_set) end function fw_step(x, gradient, lmo) - vertex = compute_extreme_point(lmo, gradient, x=x) + vertex = compute_extreme_point(lmo, gradient, x=x, kwargs...) return ( x - vertex, vertex, diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 60d34a628..8fe2b47cc 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -52,7 +52,7 @@ function blended_conditional_gradient( primal = f(x) grad!(gradient, x) # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient, x=x) + vmax = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) phi = fast_dot(gradient, x0 - vmax) / 2 dual_gap = phi traj_data = [] @@ -247,7 +247,7 @@ function blended_conditional_gradient( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) 
primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tot_time = (time_ns() - time_start) / 1e9 @@ -271,7 +271,7 @@ function blended_conditional_gradient( active_set_renormalize!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) primal = f(x) #dual_gap = 2phi dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index ffc02c098..b4e976570 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -31,6 +31,7 @@ function frank_wolfe( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -132,7 +133,7 @@ function frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # go easy on the memory - only compute if really needed if ( (mod(t, print_iter) == 0 && verbose) || @@ -201,7 +202,7 @@ function frank_wolfe( # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -256,6 +257,7 @@ function lazified_conditional_gradient( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -347,7 +349,14 @@ function lazified_conditional_gradient( primal = f(x) end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy,x=x) + v = compute_extreme_point( + lmo, + gradient, + threshold=threshold, + greedy=greedy_lazy, + x=x, + kwargs..., + ) tt = lazy if fast_dot(v, gradient) > threshold tt = dualstep @@ -413,7 +422,7 @@ function lazified_conditional_gradient( # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -465,6 +474,7 @@ function stochastic_frank_wolfe( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -559,7 +569,7 @@ function stochastic_frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # go easy on the memory - only compute if really needed if (mod(t, print_iter) == 0 && verbose) || @@ -622,7 +632,7 @@ function stochastic_frank_wolfe( # last computation done with full evaluation for exact gradient (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # @show (gradient, primal) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose diff --git a/src/oracles.jl b/src/oracles.jl index c3936d8f6..2223c9cb9 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -305,6 +305,14 @@ function compute_extreme_point( end """ + ChasingGradientLMO{LMO,T} + +Oracle wrapping another one of type LMO to boost FW with gradient pursuit. 
+Pursuit rounds end once the alignment improvement with the direction of the gradient +is lower than `improv_tol` or `max_rounds_number` is reached. +See the [paper](https://arxiv.org/abs/2003.06369) + +All keyword arguments are passed to the inner LMO. """ struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle inner::LMO @@ -312,32 +320,27 @@ struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimization improv_tol::T end -""" -""" function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) d = 0 * direction - Λ = 0 - #@debug x - #@debug direction - if norm(direction) == 0 + Λ = 0.0 + if norm(direction) <= eps(eltype(direction)) return compute_extreme_point(lmo.inner, direction, x=x, kwargs...) end flag = true function align(d1, d2) - if norm(d1) == 0 || norm(d2) == 0 - return -1 + if norm(d1) <= eps(eltype(d1)) || norm(d2) <= eps(eltype(d2)) + return -1.0 else return fast_dot(d1, d2) / (norm(d1) * norm(d2)) end end residual = -direction - d for round in 1:lmo.max_rounds_number - @debug round - #@debug residual - v = compute_extreme_point(lmo.inner, -residual; x=x, kwargs...) + if norm(residual) <= eps(eltype(residual)) + break + end + v = compute_extreme_point(lmo.inner, -residual; kwargs...) u = v - x - #@debug v - #@debug v - x d_norm = norm(d) if d_norm > 0 && fast_dot(-residual, -d / d_norm) < fast_dot(-residual, u) u = -d / d_norm @@ -358,6 +361,10 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) break end end - #@debug x + d / Λ - return x + d / Λ + + if Λ <= eps(eltype(Λ)) + return x + else + return x + d / Λ + end end diff --git a/test/lmo.jl b/test/lmo.jl index 98af39504..a22151a51 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -533,48 +533,38 @@ end @test vvec ≈ [vinf; v1] end -@testset "ChasingGradient LMO" begin - max_rounds = 100 - improv_tol = 10e-3 - f(x) = norm(x)^2 - function grad!(storage, x) - return storage .= 2x - end - lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1) - lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) - x0 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(5)) - @show x0 - res = FrankWolfe.frank_wolfe( - f, - grad!, - lmo, - x0, - max_iteration=10, - line_search=FrankWolfe.Agnostic(), - verbose=true, - ) - @show res -end -@testset "ChasingGradient LMO 2" begin +@testset "Chasing Gradient LMO" begin max_rounds = 100 improv_tol = 10e-3 - f(x) = norm(x)^2 + xp = ones(5) + f(x) = norm(x - xp)^2 function grad!(storage, x) - return storage .= 2x + @. 
storage = 2 * (x - xp) + return nothing end - lmo_norm = FrankWolfe.LpNormLMO{1}(1) + lmo_norm = FrankWolfe.LpNormLMO{Float64,1}(1) lmo = FrankWolfe.ChasingGradientLMO(lmo_norm, max_rounds, improv_tol) - x0 = ones(5) - @show x0 - res = FrankWolfe.frank_wolfe( + x0 = zeros(5) + res_boosting = FrankWolfe.frank_wolfe( f, grad!, lmo, x0, - max_iteration=10, + max_iteration=1, line_search=FrankWolfe.Agnostic(), verbose=true, ) - @show res + # x0 = zeros(5) + # res = FrankWolfe.frank_wolfe( + # f, + # grad!, + # lmo_norm, + # x0, + # max_iteration = 1000, + # line_search = FrankWolfe.Agnostic(), + # verbose = true, + # ) + @test abs(res_boosting[3] - 3.2) < 1.0e-5 + end From d21b90ad651542ad4472338b6e34110b402e35f3 Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 8 Jul 2021 17:21:01 +0200 Subject: [PATCH 3/9] test with ProbabilitySimplexOracle --- src/oracles.jl | 5 +++-- test/lmo.jl | 43 ++++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/oracles.jl b/src/oracles.jl index 2223c9cb9..a49f4354f 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -314,7 +314,7 @@ See the [paper](https://arxiv.org/abs/2003.06369) All keyword arguments are passed to the inner LMO. """ -struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle +struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G} <: LinearMinimizationOracle inner::LMO max_rounds_number::Int improv_tol::T @@ -337,6 +337,7 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) residual = -direction - d for round in 1:lmo.max_rounds_number if norm(residual) <= eps(eltype(residual)) + #@show round break end v = compute_extreme_point(lmo.inner, -residual; kwargs...) @@ -358,10 +359,10 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) end flag = true else + #@show round break end end - if Λ <= eps(eltype(Λ)) return x else diff --git a/test/lmo.jl b/test/lmo.jl index a22151a51..1db23a1e1 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -535,36 +535,41 @@ end @testset "Chasing Gradient LMO" begin - max_rounds = 100 + max_rounds = 1 improv_tol = 10e-3 - xp = ones(5) + n = Int(1e5) + xpi = rand(n) + total = sum(xpi) + xp = xpi ./ total f(x) = norm(x - xp)^2 function grad!(storage, x) @. 
storage = 2 * (x - xp) return nothing end - lmo_norm = FrankWolfe.LpNormLMO{Float64,1}(1) - lmo = FrankWolfe.ChasingGradientLMO(lmo_norm, max_rounds, improv_tol) - x0 = zeros(5) + lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) + x00 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(n)) + d=zeros(n) + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol,d) + x0 = deepcopy(x00) res_boosting = FrankWolfe.frank_wolfe( f, grad!, lmo, x0, - max_iteration=1, - line_search=FrankWolfe.Agnostic(), + max_iteration=500, + print_iter=50, + line_search=FrankWolfe.Adaptive(), + verbose=true, + ) + x0 = deepcopy(x00) + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo_prob, + x0, + max_iteration=500, + print_iter=50, + line_search=FrankWolfe.Adaptive(), verbose=true, ) - # x0 = zeros(5) - # res = FrankWolfe.frank_wolfe( - # f, - # grad!, - # lmo_norm, - # x0, - # max_iteration = 1000, - # line_search = FrankWolfe.Agnostic(), - # verbose = true, - # ) - @test abs(res_boosting[3] - 3.2) < 1.0e-5 - end From 86b186fce71d45e716789b567dacd3c4cb2162ae Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 8 Jul 2021 17:28:23 +0200 Subject: [PATCH 4/9] cleanup --- src/oracles.jl | 2 +- test/lmo.jl | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/oracles.jl b/src/oracles.jl index a49f4354f..75d94e73a 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -314,7 +314,7 @@ See the [paper](https://arxiv.org/abs/2003.06369) All keyword arguments are passed to the inner LMO. """ -struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G} <: LinearMinimizationOracle +struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle inner::LMO max_rounds_number::Int improv_tol::T diff --git a/test/lmo.jl b/test/lmo.jl index 1db23a1e1..d77ce6055 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -548,8 +548,7 @@ end end lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) x00 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(n)) - d=zeros(n) - lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol,d) + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) x0 = deepcopy(x00) res_boosting = FrankWolfe.frank_wolfe( f, From 4af4f5deefe28972df8cae9cc5f199e11417165e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 8 Jul 2021 22:25:33 +0200 Subject: [PATCH 5/9] Update src/oracles.jl --- src/oracles.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/oracles.jl b/src/oracles.jl index 75d94e73a..636442c66 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -365,7 +365,6 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) 
end if Λ <= eps(eltype(Λ)) return x - else - return x + d / Λ end + return x + d / Λ end From 87d29d64c80c3de0104acab02dfeb4065095e77f Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 15 Jul 2021 17:27:10 +0200 Subject: [PATCH 6/9] improvements & tests --- src/FrankWolfe.jl | 3 +- src/afw.jl | 24 +++++----- src/blended_cg.jl | 6 +-- src/fw_algorithms.jl | 18 ++++--- src/oracles.jl | 106 +++++++++++++++++++++++++++++------------ src/simplex_oracles.jl | 2 +- test/lmo.jl | 84 ++++++++++++++++++-------------- test/temp.jl | 61 ++++++++++++++++++++++++ 8 files changed, 213 insertions(+), 91 deletions(-) create mode 100644 test/temp.jl diff --git a/src/FrankWolfe.jl b/src/FrankWolfe.jl index 0f5eee244..ccd0054b1 100644 --- a/src/FrankWolfe.jl +++ b/src/FrankWolfe.jl @@ -1,5 +1,6 @@ module FrankWolfe +using Plots: SparseArrays using LinearAlgebra using Printf using ProgressMeter @@ -41,7 +42,7 @@ include("fw_algorithms.jl") # collecting most common data types etc and precompile # min version req set to 1.5 to prevent stalling of julia 1 -@static if VERSION >= v"1.5" +@static if VERSION >= v"1.5" println("Precompiling common signatures. This might take a moment...") include("precompile.jl") end diff --git a/src/afw.jl b/src/afw.jl index 47435686b..92d55f507 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -90,7 +90,7 @@ function away_frank_wolfe( x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) phi_value = fast_dot(x, gradient) - fast_dot(v, gradient) while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -128,14 +128,14 @@ function away_frank_wolfe( if away_steps if lazy d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K) + lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K, kwargs...) else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - afw_step(x, gradient, lmo, active_set) + afw_step(x, gradient, lmo, active_set; kwargs...) end else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - fw_step(x, gradient, lmo) + fw_step(x, gradient, lmo; kwargs...) end @@ -217,7 +217,7 @@ function away_frank_wolfe( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tt = last @@ -239,7 +239,7 @@ function away_frank_wolfe( active_set_cleanup!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -262,7 +262,7 @@ function away_frank_wolfe( return x, v, primal, dual_gap, traj_data, active_set end -function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) +function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0, kwargs...) v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) # Do lazy FW step grad_dot_lazy_fw_vertex = fast_dot(v, gradient) @@ -290,7 +290,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) index = a_loc # Resort to calling the LMO else - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) 
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...) # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex @@ -318,11 +318,11 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt end -function afw_step(x, gradient, lmo, active_set) +function afw_step(x, gradient, lmo, active_set; kwargs...) local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) away_gap = fast_dot(a, gradient) - fast_dot(x, gradient) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x dual_gap = grad_dot_x - fast_dot(v, gradient) @@ -346,8 +346,8 @@ function afw_step(x, gradient, lmo, active_set) return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt end -function fw_step(x, gradient, lmo) - vertex = compute_extreme_point(lmo, gradient, x=x, kwargs...) +function fw_step(x, gradient, lmo; kwargs...) + vertex = compute_extreme_point(lmo, gradient, x=x; kwargs...) return ( x - vertex, vertex, diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 8fe2b47cc..f01af2dbb 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -52,7 +52,7 @@ function blended_conditional_gradient( primal = f(x) grad!(gradient, x) # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) + vmax = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...) phi = fast_dot(gradient, x0 - vmax) / 2 dual_gap = phi traj_data = [] @@ -247,7 +247,7 @@ function blended_conditional_gradient( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tot_time = (time_ns() - time_start) / 1e9 @@ -271,7 +271,7 @@ function blended_conditional_gradient( active_set_renormalize!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...) primal = f(x) #dual_gap = 2phi dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index b4e976570..0258fbc59 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -133,7 +133,8 @@ function frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) + #@show typeof(v) # go easy on the memory - only compute if really needed if ( (mod(t, print_iter) == 0 && verbose) || @@ -173,6 +174,9 @@ function frank_wolfe( step_lim, one(eltype(x)), ) + #@show typeof(x) + #@show typeof(d) + #@show typeof(x-gamma*d) @emphasis(emphasis, x = x - gamma * d) @@ -202,7 +206,9 @@ function frank_wolfe( # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) 
+ #@show v + #@show typeof(v) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -354,7 +360,7 @@ function lazified_conditional_gradient( gradient, threshold=threshold, greedy=greedy_lazy, - x=x, + x=x; kwargs..., ) tt = lazy @@ -422,7 +428,7 @@ function lazified_conditional_gradient( # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -569,7 +575,7 @@ function stochastic_frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) # go easy on the memory - only compute if really needed if (mod(t, print_iter) == 0 && verbose) || @@ -632,7 +638,7 @@ function stochastic_frank_wolfe( # last computation done with full evaluation for exact gradient (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) # @show (gradient, primal) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose diff --git a/src/oracles.jl b/src/oracles.jl index 75d94e73a..31fe7f03f 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -61,7 +61,7 @@ function compute_extreme_point( return v end end - v = compute_extreme_point(lmo.inner, direction, kwargs...) + v = compute_extreme_point(lmo.inner, direction; kwargs...) if store_cache lmo.last_vertex = v end @@ -165,7 +165,7 @@ function compute_extreme_point( end # no interesting point found, computing new # println("LP sol") - v = compute_extreme_point(lmo.inner, direction, kwargs...) + v = compute_extreme_point(lmo.inner, direction; kwargs...) if store_cache tup = Base.setindex(lmo.vertices, v, lmo.oldest_idx) lmo.vertices = tup @@ -308,64 +308,108 @@ end ChasingGradientLMO{LMO,T} Oracle wrapping another one of type LMO to boost FW with gradient pursuit. -Pursuit rounds end once the alignment improvement with the direction of the gradient +Pursuit rounds end once the alignment improvement with the direction of the gradient is lower than `improv_tol` or `max_rounds_number` is reached. See the [paper](https://arxiv.org/abs/2003.06369) All keyword arguments are passed to the inner LMO. """ -struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle +mutable struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G,G2} <: LinearMinimizationOracle inner::LMO max_rounds_number::Int improv_tol::T + d::G + u::G + residual::G2 + m_residual::G2 +end + +function ChasingGradientLMO(lmo, max_rounds_number, improv_tol, d, residual) + return ChasingGradientLMO( + lmo, + max_rounds_number, + improv_tol, + d, + copy(d), + residual, + copy(residual), + ) +end + +function _zero!(d) + @. d = 0 +end + +function _zero!(d::SparseArrays.AbstractSparseArray) + @. d = 0 + return SparseArrays.dropzeros!(d) +end + +function _inplace_plus(d, v) + @. d += v +end + +function _inplace_plus(d, v::ScaledHotVector) + return d[v.val_idx] += v.active_val end function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) - d = 0 * direction - Λ = 0.0 - if norm(direction) <= eps(eltype(direction)) - return compute_extreme_point(lmo.inner, direction, x=x, kwargs...) 
+ _zero!(lmo.d) + v_ret = 0 * lmo.d + Λ = zero(eltype(direction)) + norm_direction = norm(direction) + if norm_direction <= eps(eltype(direction)) + v_ret .+= compute_extreme_point(lmo.inner, direction, x=x, kwargs...) + return v_ret end - flag = true - function align(d1, d2) - if norm(d1) <= eps(eltype(d1)) || norm(d2) <= eps(eltype(d2)) - return -1.0 + use_v = true + function align(d) + if norm(d) <= eps(eltype(d)) + return -one(eltype(d)) else - return fast_dot(d1, d2) / (norm(d1) * norm(d2)) + return -fast_dot(direction, d) / (norm_direction * norm(d)) end end - residual = -direction - d + @. lmo.residual = -direction + @. lmo.m_residual = direction for round in 1:lmo.max_rounds_number - if norm(residual) <= eps(eltype(residual)) + if norm(lmo.residual) <= eps(eltype(lmo.residual)) #@show round break end - v = compute_extreme_point(lmo.inner, -residual; kwargs...) - u = v - x - d_norm = norm(d) - if d_norm > 0 && fast_dot(-residual, -d / d_norm) < fast_dot(-residual, u) - u = -d / d_norm - flag = false + v = compute_extreme_point(lmo.inner, lmo.m_residual; kwargs...) + #@. lmo.u = v - x + @. lmo.u = -x + _inplace_plus(lmo.u, v) + d_norm = norm(lmo.d) + if d_norm > 0 && fast_dot(lmo.residual, lmo.d / d_norm) < -fast_dot(lmo.residual, lmo.u) + @. lmo.u = -lmo.d / d_norm + use_v = false end - λ = fast_dot(residual, u) / (norm(u)^2) - d_updated = d + λ * u - if align(-direction, d_updated) - align(-direction, d) >= lmo.improv_tol - d = d_updated - residual = -direction - d - if flag + λ = fast_dot(lmo.residual, lmo.u) / (norm(lmo.u)^2) + d_updated = lmo.u + @. d_updated = lmo.d + λ * lmo.u #2 allocs + if align(d_updated) - align(lmo.d) >= lmo.improv_tol + @. lmo.d = d_updated + # @. lmo.residual = -direction - lmo.d + @. lmo.residual -= λ * lmo.u + @. lmo.m_residual = -lmo.residual + if use_v Λ += λ else Λ = Λ * (1 - λ / d_norm) end - flag = true + use_v = true else #@show round break end end if Λ <= eps(eltype(Λ)) - return x - else - return x + d / Λ + v_ret .+= compute_extreme_point(lmo.inner, direction, x=x, kwargs...) + return v_ret end + @. v_ret += x + @. v_ret += lmo.d / Λ #2 allocations + return v_ret end diff --git a/src/simplex_oracles.jl b/src/simplex_oracles.jl index 34d5407cd..e1e96e411 100644 --- a/src/simplex_oracles.jl +++ b/src/simplex_oracles.jl @@ -21,7 +21,7 @@ LMO for scaled unit simplex: Returns either vector of zeros or vector with one active value equal to RHS if there exists an improving direction. """ -function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction) where {T} +function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction; kwargs...) where {T} idx = argmin(direction) if direction[idx] < 0 return ScaledHotVector(lmo.right_side, idx, length(direction)) diff --git a/test/lmo.jl b/test/lmo.jl index d77ce6055..5492bb050 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -1,7 +1,7 @@ using Test using FrankWolfe using LinearAlgebra -import SparseArrays +using SparseArrays import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO @@ -519,7 +519,7 @@ end end @testset "Product LMO" begin - # + # lmo = FrankWolfe.ProductLMO(FrankWolfe.LpNormLMO{Inf}(3.0), FrankWolfe.LpNormLMO{1}(2.0)) dinf = randn(10) d1 = randn(5) @@ -535,40 +535,50 @@ end @testset "Chasing Gradient LMO" begin - max_rounds = 1 - improv_tol = 10e-3 - n = Int(1e5) - xpi = rand(n) - total = sum(xpi) - xp = xpi ./ total - f(x) = norm(x - xp)^2 - function grad!(storage, x) - @. 
storage = 2 * (x - xp) - return nothing + @testset "Consistent results with 1 round" begin + max_rounds = 1 + improv_tol = 1e-3 + n = Int(1e5) + xpi = rand(n) + total = sum(xpi) + xp = xpi ./ total + f(x) = norm(x - xp)^2 + function grad!(storage, x) + @. storage = 2 * (x - xp) + return nothing + end + d = spzeros(n) + residual = zeros(n) + for lmo_inner in [ + FrankWolfe.LpNormLMO{1}(1.0), + FrankWolfe.LpNormLMO{1}(1.0), + FrankWolfe.ProbabilitySimplexOracle(1.0), + ] + x00 = FrankWolfe.compute_extreme_point(lmo_inner, zeros(n)) + lmo = FrankWolfe.ChasingGradientLMO(lmo_inner, max_rounds, improv_tol, d, residual) + x0 = sparse(deepcopy(x00)) + res_boosting = FrankWolfe.frank_wolfe( + f, + grad!, + lmo, + x0, + max_iteration=500, + gradient=zeros(n), + line_search=FrankWolfe.Adaptive(), + verbose=true, + ) + x0 = deepcopy(x00) + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo_inner, + x0, + max_iteration=500, + line_search=FrankWolfe.Adaptive(), + verbose=true, + ) + @test isapprox(res_boosting[3], res[3]) + end end - lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) - x00 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(n)) - lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) - x0 = deepcopy(x00) - res_boosting = FrankWolfe.frank_wolfe( - f, - grad!, - lmo, - x0, - max_iteration=500, - print_iter=50, - line_search=FrankWolfe.Adaptive(), - verbose=true, - ) - x0 = deepcopy(x00) - res = FrankWolfe.frank_wolfe( - f, - grad!, - lmo_prob, - x0, - max_iteration=500, - print_iter=50, - line_search=FrankWolfe.Adaptive(), - verbose=true, - ) + end diff --git a/test/temp.jl b/test/temp.jl new file mode 100644 index 000000000..439d1a3a4 --- /dev/null +++ b/test/temp.jl @@ -0,0 +1,61 @@ + +using Test +using FrankWolfe +using LinearAlgebra + +using SparseArrays + +using ProfileView +using Random +using BenchmarkTools +import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO + +using Plots + + + +function test(lmo, direction, x0, xp) + for _ in 1:100 + FrankWolfe.compute_extreme_point(lmo, direction, x=x0) + end + # res_boosting = FrankWolfe.frank_wolfe( + # f, + # grad!, + # lmo, + # x0, + # gradient=zeros(n), + # max_iteration=500, + # print_iter=50, + # line_search=FrankWolfe.Adaptive(), + # verbose=true, + # ) +end + + +function bench() + max_rounds = 10 + improv_tol = 10e-3 + for nb_dim in [Int(1e2), Int(1e3), Int(1e4), Int(1e5), Int(1e6)] + @show nb_dim + Random.seed!(1234) + xpi = rand(nb_dim) + total = sum(xpi) + xp = xpi ./ total + d = spzeros(nb_dim) + lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) + x0 = sparse(FrankWolfe.compute_extreme_point(lmo_prob, zeros(nb_dim))) + residual = zeros(nb_dim) + direction = rand(nb_dim) + f(x) = norm(x - xp)^2 + function grad!(storage, x) + @. 
storage = 2 * (x - xp) + return nothing + end + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol, d, residual) + # @profview test(lmo, direction, x0,xp) + # @profview test(lmo, direction, x0,xp) + @btime test($lmo, $direction, $x0, $xp) + end +end + +bench() From abfd67b25dddfe958aa8c37c3160073e7930fc07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 19 Jul 2021 17:40:46 +0200 Subject: [PATCH 7/9] remove wrong import --- src/FrankWolfe.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/FrankWolfe.jl b/src/FrankWolfe.jl index ccd0054b1..4c403870f 100644 --- a/src/FrankWolfe.jl +++ b/src/FrankWolfe.jl @@ -1,6 +1,5 @@ module FrankWolfe -using Plots: SparseArrays using LinearAlgebra using Printf using ProgressMeter From 64201944099504c12e19441a037fa417a9d7f72d Mon Sep 17 00:00:00 2001 From: Matthieu <57533715+mattplo@users.noreply.github.com> Date: Tue, 20 Jul 2021 14:52:39 +0200 Subject: [PATCH 8/9] fix remaining merge conflict --- src/fw_algorithms.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index b61c989c8..d4740c8ec 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,12 +30,8 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, -<<<<<<< HEAD - print_callback=FrankWolfe.print_callback, - kwargs..., -======= print_callback=print_callback, ->>>>>>> master + kwargs..., ) # format string for output of the algorithm From 1a1ad364c9b622381f9713107f28470aff1bc82d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Sat, 7 Aug 2021 18:39:50 +0200 Subject: [PATCH 9/9] fix conflict, minor format --- src/fw_algorithms.jl | 6 +----- src/oracles.jl | 17 ++++++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index b61c989c8..d4740c8ec 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,12 +30,8 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, -<<<<<<< HEAD - print_callback=FrankWolfe.print_callback, - kwargs..., -======= print_callback=print_callback, ->>>>>>> master + kwargs..., ) # format string for output of the algorithm diff --git a/src/oracles.jl b/src/oracles.jl index 31fe7f03f..58e59bc59 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -307,12 +307,15 @@ end """ ChasingGradientLMO{LMO,T} -Oracle wrapping another one of type LMO to boost FW with gradient pursuit. -Pursuit rounds end once the alignment improvement with the direction of the gradient +Oracle boosting FW with gradient pursuit: the LMO constructs a direction as +a combination of multiple extreme vertices: +`d_t = ∑_k λ_k (v_k - x_t)` found in successive rounds. +Pursuit rounds terminate once the alignment improvement with the direction of the gradient is lower than `improv_tol` or `max_rounds_number` is reached. -See the [paper](https://arxiv.org/abs/2003.06369) -All keyword arguments are passed to the inner LMO. +The LMO also keeps internal containers `d`, `u`, `residual` and `m_residual` to reduce allocations. +See the [paper](https://arxiv.org/abs/2003.06369). +All keyword arguments to `compute_extreme_point` are passed to the inner LMO. """ mutable struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G,G2} <: LinearMinimizationOracle inner::LMO @@ -338,11 +341,13 @@ end function _zero!(d) @. d = 0 + return nothing end function _zero!(d::SparseArrays.AbstractSparseArray) @. 
d = 0 - return SparseArrays.dropzeros!(d) + SparseArrays.dropzeros!(d) + return nothing end function _inplace_plus(d, v) @@ -374,11 +379,9 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) @. lmo.m_residual = direction for round in 1:lmo.max_rounds_number if norm(lmo.residual) <= eps(eltype(lmo.residual)) - #@show round break end v = compute_extreme_point(lmo.inner, lmo.m_residual; kwargs...) - #@. lmo.u = v - x @. lmo.u = -x _inplace_plus(lmo.u, v) d_norm = norm(lmo.d)
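
The boosted direction built by ChasingGradientLMO across this series can be summarized outside the package by the minimal sketch below, using only LinearAlgebra. It reproduces the pursuit loop of the first patch (with the zero-Λ guard added in the follow-up patches) and assumes a nonzero gradient, which the package version guards separately. The names simplex_vertex and chase_gradient are illustrative only and not part of FrankWolfe.jl; the inner oracle is hard-coded to the unit probability simplex so that the snippet is self-contained and runnable.

    using LinearAlgebra

    # Toy inner LMO: vertex of the unit probability simplex minimizing <direction, v>.
    simplex_vertex(direction) = (v = zeros(length(direction)); v[argmin(direction)] = 1.0; v)

    # One boosted-LMO call: accumulate d ≈ -gradient as a weighted combination of
    # (vertex - x) directions, then rescale by Λ so that x + d/Λ stays feasible.
    function chase_gradient(gradient, x; max_rounds=10, improv_tol=1e-3)
        d = zero(gradient)   # accumulated direction
        Λ = 0.0              # accumulated weights, used for the final rescaling
        align(dd) = norm(dd) <= eps() ? -1.0 : -dot(gradient, dd) / (norm(gradient) * norm(dd))
        for _ in 1:max_rounds
            residual = -gradient - d           # part of -∇f(x) not yet captured by d
            v = simplex_vertex(-residual)      # vertex best aligned with the residual
            u = v - x
            use_v = true
            if norm(d) > 0 && dot(residual, -d / norm(d)) > dot(residual, u)
                u = -d / norm(d)               # shrinking d beats adding a new vertex
                use_v = false
            end
            norm(u) > 0 || break
            λ = dot(residual, u) / norm(u)^2   # exact step length onto the residual
            d_new = d + λ * u
            align(d_new) - align(d) >= improv_tol || break  # stop once alignment stalls
            Λ = use_v ? Λ + λ : Λ * (1 - λ / norm(d))
            d = d_new
        end
        return Λ <= eps() ? x : x + d / Λ      # the "vertex" handed back to frank_wolfe
    end

    x = fill(0.2, 5)                           # current iterate in the simplex
    gradient = 2 .* (x .- [1, 0, 0, 0, 0])     # gradient of ||x - e1||^2 at x
    @show chase_gradient(gradient, x)          # recovers the vertex e1 here

With the constructor as of the "improvements & tests" patch, the wrapper itself takes preallocated work vectors and is passed to the solver in place of the inner oracle, e.g. lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol, d, residual) followed by FrankWolfe.frank_wolfe(f, grad!, lmo, x0, ...), as the added testset in test/lmo.jl does.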