From cc53d1b10b9be4ac453568b188d18a113e75f168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 31 Jul 2023 15:28:15 +0200 Subject: [PATCH 01/16] weak separation oracle --- src/abstract_oracles.jl | 102 ++++++++++++++++++++++++++++++++++++++-- src/fw_algorithms.jl | 21 ++++++--- 2 files changed, 111 insertions(+), 12 deletions(-) diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl index 2fe2f9e9e..3a5af8786 100644 --- a/src/abstract_oracles.jl +++ b/src/abstract_oracles.jl @@ -17,6 +17,29 @@ All LMOs should accept keyword arguments that they can ignore. """ function compute_extreme_point end +""" + compute_weak_separation_point(lmo, direction, max_value) -> (vertex, gap) + +Weak separation algorithm for a given oracle. +Unlike `compute_extreme_point`, `compute_weak_separation_point` may provide a suboptimal `vertex` with the following conditions: +- `vertex` is still a valid extreme point of the polytope. +IF an inexact vertex is computed: +- `⟨v, d⟩ ≤ max_value`, the pre-specified required improvement. +- `⟨v, d⟩ ≤ ⟨v_opt, d⟩ + gap`, with `v_opt` a vertex computed with the exact oracle. +- If the algorithm used to compute the inexact vertex provides a bound on optimality, the `gap` value must be valid. +- Otherwise, e.g. if the vertex is computed with a heuristic, `gap = ∞`. +ELSE (the oracle computes an optimal vertex): +- `⟨v, d⟩` may be greater than `max_value`, `gap` must be 0. +""" +function compute_weak_separation_point(lmo, direction, max_value; kwargs...) end + +# default to computing an exact vertex. +function compute_weak_separation_point(lmo::LinearMinimizationOracle, direction, max_value; kwargs...) + v = compute_extreme_point(lmo, direction; kwargs...) + gap = zero(eltype(v)) * zero(eltype(direction)) + return v, gap +end + """ CachedLinearMinimizationOracle{LMO} @@ -43,11 +66,29 @@ Vertices of `LMO` have to be of type `VT` if provided. mutable struct SingleLastCachedLMO{LMO,A} <: CachedLinearMinimizationOracle{LMO} last_vertex::Union{Nothing,A} inner::LMO + store_cache::Bool end # initializes with no cache by default SingleLastCachedLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} = - SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo) + SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo, true) + +# gap is 0 if exact, ∞ if cached point +function compute_weak_separation_point(lmo::SingleLastCachedLMO, direction, max_value; kwargs...) + if lmo.last_vertex !== nothing && isfinite(max_value) + # cache is a sufficiently-decreasing direction + if fast_dot(lmo.last_vertex, direction) ≤ max_value + T = promote_type(eltype(lmo.last_vertex), eltype(direction)) + return lmo.last_vertex, T(Inf) + end + end + v = compute_extreme_point(lmo.inner, direction, kwargs...) + if lmo.store_cache + lmo.last_vertex = v + end + T = promote_type(eltype(v), eltype(direction)) + return (v, zero(T)) +end function compute_extreme_point( lmo::SingleLastCachedLMO, @@ -62,7 +103,7 @@ function compute_extreme_point( return lmo.last_vertex end end - v = compute_extreme_point(lmo.inner, direction, kwargs...) + v = compute_extreme_point(lmo.inner, direction, v=v, kwargs...) 
if store_cache lmo.last_vertex = v end @@ -188,14 +229,16 @@ mutable struct VectorCacheLMO{LMO<:LinearMinimizationOracle,VT} <: CachedLinearMinimizationOracle{LMO} vertices::Vector{VT} inner::LMO + store_cache::Bool + greedy::Bool end function VectorCacheLMO{LMO,VT}(lmo::LMO) where {VT,LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,VT}(VT[], lmo) + return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false) end function VectorCacheLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo) + return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false) end function Base.empty!(lmo::VectorCacheLMO) @@ -205,6 +248,55 @@ end Base.length(lmo::VectorCacheLMO) = length(lmo.vertices) +function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value; kwargs...) + if isempty(lmo.vertices) + v = compute_extreme_point(lmo.inner, direction) + T = promote_type(eltype(v), eltype(direction)) + if lmo.store_cache + push!(lmo.vertices, v) + end + return v, zero(T) + end + best_idx = -1 + best_val = Inf + best_v = nothing + for idx in reverse(eachindex(lmo.vertices)) + @inbounds v = lmo.vertices[idx] + new_val = fast_dot(v, direction) + if new_val ≤ max_value + T = promote_type(eltype(v), eltype(direction)) + # stop and return + if lmo.greedy + return v, T(Inf) + end + # otherwise, compare to incumbent + if new_val < best_val + best_v = v + best_val = new_val + best_idx = idx + end + end + end + if best_idx > 0 + T = promote_type(eltype(best_v), eltype(direction)) + return best_v, T(Inf) + end + # no satisfactory vertex found + v = compute_extreme_point(lmo.inner, direction) + if lmo.store_cache + # note: we do not check for duplicates. hence you might end up with more vertices, + # in fact up to number of dual steps many, that might be already in the cache + # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it) + # and it is the best possible, hence we will perform a dual step on the outside. + # + # note: another possibility could be to test against that in the if statement but then you might end you recalculating the same vertex a few times. + # as such this might be a better tradeoff, i.e., to not check the set for duplicates and potentially accept #dual_steps many duplicates. 
+ push!(lmo.vertices, v) + end + T = promote_type(eltype(v), eltype(direction)) + return v, zero(v) +end + function compute_extreme_point( lmo::VectorCacheLMO, direction; @@ -228,7 +320,7 @@ function compute_extreme_point( @inbounds v = lmo.vertices[idx] new_val = fast_dot(v, direction) if new_val ≤ threshold - # stop, store and return + # stop and return if greedy return v end diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index eff4bdb57..9280b37af 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -293,6 +293,7 @@ function lazified_conditional_gradient( end lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + lmo.greedy = greedy_lazy if isfinite(cache_size) Base.sizehint!(lmo.vertices, cache_size) end @@ -300,7 +301,7 @@ function lazified_conditional_gradient( t = 0 dual_gap = Inf primal = Inf - v = [] + v = x0 x = x0 phi = Inf tt = regular @@ -353,7 +354,7 @@ function lazified_conditional_gradient( while t <= max_iteration && dual_gap >= max(epsilon, eps(float(eltype(x)))) ##################### - # managing time and Ctrl-C + # managing time ##################### time_at_loop = time_ns() if t == 0 @@ -382,12 +383,18 @@ function lazified_conditional_gradient( primal = f(x) end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep + v, gap = compute_weak_separation_point(lmo, gradient, threshold) + if gap > 0 + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + phi = min(dual_gap, phi / 2) + end + else + tt = regular dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, phi / 2) + phi = dual_gap end d = muladd_memory_mode(memory_mode, d, x, v) From 6f3ed3fae0eb0af4146088be9a52b6a2e5c01197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 31 Jul 2023 15:36:30 +0200 Subject: [PATCH 02/16] return type --- src/abstract_oracles.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl index 3a5af8786..ad92a5f12 100644 --- a/src/abstract_oracles.jl +++ b/src/abstract_oracles.jl @@ -87,7 +87,7 @@ function compute_weak_separation_point(lmo::SingleLastCachedLMO, direction, max_ lmo.last_vertex = v end T = promote_type(eltype(v), eltype(direction)) - return (v, zero(T)) + return v, zero(T) end function compute_extreme_point( @@ -294,7 +294,7 @@ function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value push!(lmo.vertices, v) end T = promote_type(eltype(v), eltype(direction)) - return v, zero(v) + return v, zero(T) end function compute_extreme_point( From 500deb631903a0ac67dbf8577ebf5f43f6253bf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 31 Jul 2023 18:21:22 +0200 Subject: [PATCH 03/16] useless duplicate --- src/moi_oracle.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/moi_oracle.jl b/src/moi_oracle.jl index 875b8001d..18014a225 100644 --- a/src/moi_oracle.jl +++ b/src/moi_oracle.jl @@ -128,7 +128,6 @@ function _optimize_and_return(lmo, variables) term_st = MOI.get(lmo.o, MOI.TerminationStatus()) if term_st ∉ (MOI.OPTIMAL, MOI.ALMOST_OPTIMAL, MOI.SLOW_PROGRESS) @error "Unexpected termination: $term_st" - return MOI.get.(lmo.o, MOI.VariablePrimal(), variables) end return MOI.get.(lmo.o, MOI.VariablePrimal(), variables) end From d3a012486844fa4c3c53d41f8a12feb2610061ba Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 14:49:21 +0200 Subject: [PATCH 04/16] weak separation in AFW --- src/afw.jl | 57 +++++++++++++++++++-------- src/defs.jl | 2 + src/tracking.jl | 7 +++- test/weak_separation.jl | 85 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 16 deletions(-) create mode 100644 test/weak_separation.jl diff --git a/src/afw.jl b/src/afw.jl index 851aec274..59fe67982 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -33,6 +33,7 @@ function away_frank_wolfe( use_extra_vertex_storage=false, linesearch_workspace=nothing, recompute_last_vertex=true, + weak_separation=false, ) # add the first vertex to active set from initialization active_set = ActiveSet([(1.0, x0)]) @@ -64,6 +65,7 @@ function away_frank_wolfe( use_extra_vertex_storage=use_extra_vertex_storage, linesearch_workspace=linesearch_workspace, recompute_last_vertex=recompute_last_vertex, + weak_separation=weak_separation, ) end @@ -95,6 +97,7 @@ function away_frank_wolfe( use_extra_vertex_storage=false, linesearch_workspace=nothing, recompute_last_vertex=true, + weak_separation=false, ) # format string for output of the algorithm format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" @@ -153,7 +156,7 @@ function away_frank_wolfe( ) grad_type = typeof(gradient) println( - "GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance MOMENTUM: $momentum AWAYSTEPS: $away_steps", + "GRADIENT TYPE: $grad_type LAZY: $lazy LAZY_TOLERANCE: $lazy_tolerance WEAK_SEPARATION: $weak_separation MOMENTUM: $momentum AWAYSTEPS: $away_steps", ) println("Linear Minimization Oracle: $(typeof(lmo))") if (use_extra_vertex_storage || add_dropped_vertices) && extra_vertex_storage === nothing @@ -223,10 +226,16 @@ function away_frank_wolfe( extra_vertex_storage=extra_vertex_storage, lazy_tolerance=lazy_tolerance, memory_mode=memory_mode, + weak_separation=weak_separation, ) else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - afw_step(x, gradient, lmo, active_set, epsilon, d, memory_mode=memory_mode) + afw_step( + x, gradient, lmo, active_set, epsilon, d, + memory_mode=memory_mode, + weak_separation=weak_separation, + lazy_tolerance=lazy_tolerance, + ) end else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = @@ -248,7 +257,7 @@ function away_frank_wolfe( memory_mode, ) - gamma = min(gamma_max, gamma) + gamma = min(gamma_max, gamma) # cleanup and renormalize every x iterations. Only for the fw steps. renorm = mod(t, renorm_interval) == 0 if away_step_taken @@ -368,9 +377,9 @@ function away_frank_wolfe( return x, v, primal, dual_gap, traj_data, active_set end -function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis()) +function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=true) _, v, v_loc, _, a_lambda, a, a_loc, _, _ = active_set_argminmax(active_set, gradient) - #Do lazy FW step + # do lazy FW step grad_dot_lazy_fw_vertex = fast_dot(v, gradient) grad_dot_x = fast_dot(x, gradient) grad_dot_a = fast_dot(a, gradient) @@ -385,7 +394,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ fw_step_taken = true index = v_loc else - #Do away step, as it promises enough progress. 
+ # do away step, as it promises enough progress. if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex && grad_dot_a - grad_dot_x >= phi / lazy_tolerance tt = away @@ -395,7 +404,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ away_step_taken = true fw_step_taken = false index = a_loc - #Resort to calling the LMO + # resort to calling the LMO else # optionally: try vertex storage if use_extra_vertex_storage @@ -406,13 +415,20 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ @debug("Found acceptable lazy vertex in storage") v = new_forward_vertex tt = lazylazy + end + else + found_better_vertex = false + end + if !found_better_vertex + # compute new vertex with normal or weak oracle + if weak_separation + lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance + (v, _) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + tt = weaksep else v = compute_extreme_point(lmo, gradient) tt = regular end - else - v = compute_extreme_point(lmo, gradient) - tt = regular end # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) @@ -439,14 +455,25 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt end -function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis()) +function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=false, lazy_tolerance=2.0) _, _, _, _, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) - v = compute_extreme_point(lmo, gradient) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x - dual_gap = grad_dot_x - fast_dot(v, gradient) - if dual_gap >= away_gap && dual_gap >= epsilon - tt = regular + (v, gap) = if weak_separation + # Condition for taking a FW step + # ⟨∇f, x-v⟩ ≥ gₐ + # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - gₐ + # We ask for a bit more on the FW step + # to promote away steps when we can (and therefore sparsity) + # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - K gₐ + lazy_threshold = grad_dot_x - lazy_tolerance * away_gap + compute_weak_separation_point(lmo, gradient, lazy_threshold) + else + (compute_extreme_point(lmo, gradient), 0.0) + end + dual_gap = grad_dot_x - fast_dot(v, gradient) + gap + if dual_gap > away_gap && dual_gap >= epsilon + tt = gap == 0.0 ? regular : weaksep gamma_max = one(a_lambda) d = muladd_memory_mode(memory_mode, d, x, v) vertex = v diff --git a/src/defs.jl b/src/defs.jl index db3c8c0f1..c251b8181 100644 --- a/src/defs.jl +++ b/src/defs.jl @@ -19,6 +19,7 @@ struct OutplaceEmphasis <: MemoryEmphasis end away = 6 pairwise = 7 drop = 8 + weaksep = 9 simplex_descent = 101 gap_step = 102 last = 1000 @@ -34,6 +35,7 @@ const st = ( away="A", pairwise="P", drop="D", + weaksep="W", simplex_descent="SD", gap_step="GS", last="Last", diff --git a/src/tracking.jl b/src/tracking.jl index 949806fbf..205043cc1 100644 --- a/src/tracking.jl +++ b/src/tracking.jl @@ -55,7 +55,12 @@ end function compute_extreme_point(lmo::TrackingLMO, x; kwargs...) lmo.counter += 1 - return compute_extreme_point(lmo.lmo, x) + return compute_extreme_point(lmo.lmo, x; kwargs...) 
+end + +function compute_weak_separation_point(lmo::TrackingLMO, direction, max_value) + lmo.counter += 1 + return compute_weak_separation_point(lmo.lmo, direction, max_value) end is_tracking_lmo(lmo) = false diff --git a/test/weak_separation.jl b/test/weak_separation.jl new file mode 100644 index 000000000..9894cc402 --- /dev/null +++ b/test/weak_separation.jl @@ -0,0 +1,85 @@ +using FrankWolfe +using LinearAlgebra +using Test + +""" +A test LMO representing the feasible set [0,1]^n +""" +struct Hypercube <: FrankWolfe.LinearMinimizationOracle +end + +function FrankWolfe.compute_extreme_point(::Hypercube, direction::AbstractArray{T}) where {T} + v = similar(direction, T) + v .= 0 + for idx in eachindex(v) + if direction[idx] < 0 + v[idx] = 1 + end + end + return v +end + +# a very naive implementation of a weak separation oracle without a gap (a heuristic) +# we stop iterating through the indices whenever we obtain sufficient progress +function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::AbstractArray{T}, max_value) where {T} + v = similar(direction, T) + v .= 0 + res = zero(T) + gap = zero(T) + for idx in eachindex(v) + if direction[idx] < 0 + v[idx] = 1 + res += direction[idx] + end + if res <= max_value && idx != lastindex(v) + gap = T(Inf) + break + @info "avoided $(length(v) - idx) iterations" + end + end + return v, gap +end + +@testset "Basic behaviour" begin + lmo = Hypercube() + direction = rand(4,2) + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test size(v) == size(direction) + @test norm(v) == 0 + (w, _) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test norm(w) == 0 + direction .*= -1 + direction[end] -= 1 + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test v == ones(size(direction)) + (w, gap0) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test dot(w, direction) <= -1 + @test gap0 == Inf + (w, gap1) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1.5) + @test dot(w, direction) <= -1.5 + @test gap1 == Inf + # asking for too much progress results in exact LMO call + (w, gap3) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1000) + @test gap3 == 0.0 + @test w == v +end + +@testset "AFW with weak separation" begin + n = 1000 + # reference point to get an optimum on a face + ref_point = [0.6 + mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function grad!(storage, x) + storage .= x + storage .-= ref_point + storage ./= n + end + lmo = Hypercube() + x0 = FrankWolfe.compute_extreme_point(lmo, -ones(n)) + tracking_lmo = FrankWolfe.TrackingLMO(lmo) + for lazy in (false, true) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(lmo) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true, lazy=lazy) + end +end From 0617339a348c71e7e16ae4fecbde5ac051e22afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 15:01:53 +0200 Subject: [PATCH 05/16] tests --- docs/src/reference/2_lmo.md | 9 ++++++++- test/runtests.jl | 6 ++++++ test/weak_separation.jl | 7 ++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/src/reference/2_lmo.md b/docs/src/reference/2_lmo.md index b75d936dd..72459b9a2 100644 --- a/docs/src/reference/2_lmo.md 
+++ b/docs/src/reference/2_lmo.md @@ -5,7 +5,7 @@ The Linear Minimization Oracle (LMO) is a key component called at each iteration v\in \argmin_{x\in \mathcal{C}} \langle d,x \rangle. ``` -See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on most LMOs +See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on essential LMOs implemented in the package and their comparison with projection operators. ## Interface and wrappers @@ -19,6 +19,13 @@ All of them are subtypes of [`FrankWolfe.LinearMinimizationOracle`](@ref) and im compute_extreme_point ``` +Optionally, an LMO can implement a weak separation procedure based either on a heuristic or on an approximation algorithm: +```@docs +compute_weak_separation_point +``` + +Weak separation procedures will be used in the methods using an active set and lazified variants only. + We also provide some meta-LMOs wrapping another one with extended behavior: ```@docs FrankWolfe.CachedLinearMinimizationOracle diff --git a/test/runtests.jl b/test/runtests.jl index 1d0c9c4c9..d33b7378c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -785,3 +785,9 @@ include("generic-arrays.jl") end end end + +module WeakSeparation +@testset "weak separation oracles" begin + include("weak_separation.jl") +end +end diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 9894cc402..553084536 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -76,10 +76,11 @@ end end lmo = Hypercube() x0 = FrankWolfe.compute_extreme_point(lmo, -ones(n)) - tracking_lmo = FrankWolfe.TrackingLMO(lmo) for lazy in (false, true) - x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false, lazy=lazy) + tracking_lmo = FrankWolfe.TrackingLMO(lmo) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) tracking_weak = FrankWolfe.TrackingLMO(lmo) - x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true, lazy=lazy) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter <= tracking_weak.counter end end From 4172c516c8b663454cbfd576bcfe58016b565346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 17:46:28 +0200 Subject: [PATCH 06/16] include module --- test/runtests.jl | 1 + test/weak_separation.jl | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d33b7378c..fa9c9b29e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -787,6 +787,7 @@ include("generic-arrays.jl") end module WeakSeparation +using Test @testset "weak separation oracles" begin include("weak_separation.jl") end diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 553084536..9789ca671 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -3,12 +3,16 @@ using LinearAlgebra using Test """ -A test LMO representing the feasible set [0,1]^n +A test LMO representing the feasible set [0,1]^n. +`nops` tracks the number of operations performed. 
""" -struct Hypercube <: FrankWolfe.LinearMinimizationOracle +mutable struct Hypercube <: FrankWolfe.LinearMinimizationOracle + nops::Int end -function FrankWolfe.compute_extreme_point(::Hypercube, direction::AbstractArray{T}) where {T} +Hypercube() = Hypercube(0) + +function FrankWolfe.compute_extreme_point(lmo::Hypercube, direction::AbstractArray{T}) where {T} v = similar(direction, T) v .= 0 for idx in eachindex(v) @@ -16,6 +20,7 @@ function FrankWolfe.compute_extreme_point(::Hypercube, direction::AbstractArray{ v[idx] = 1 end end + lmo.nops += length(v) return v end @@ -26,7 +31,9 @@ function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::Abstra v .= 0 res = zero(T) gap = zero(T) + nops = 0 for idx in eachindex(v) + nops += 1 if direction[idx] < 0 v[idx] = 1 res += direction[idx] @@ -34,9 +41,9 @@ function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::Abstra if res <= max_value && idx != lastindex(v) gap = T(Inf) break - @info "avoided $(length(v) - idx) iterations" end end + lmo.nops += nops return v, gap end @@ -74,12 +81,11 @@ end storage .-= ref_point storage ./= n end - lmo = Hypercube() - x0 = FrankWolfe.compute_extreme_point(lmo, -ones(n)) + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) for lazy in (false, true) - tracking_lmo = FrankWolfe.TrackingLMO(lmo) + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) - tracking_weak = FrankWolfe.TrackingLMO(lmo) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) @test tracking_lmo.counter <= tracking_weak.counter end From 2102dfad12050b7b865090288770b5a84131e71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 18:24:38 +0200 Subject: [PATCH 07/16] include lmo arg --- test/weak_separation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 9789ca671..d21dd49f2 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -26,7 +26,7 @@ end # a very naive implementation of a weak separation oracle without a gap (a heuristic) # we stop iterating through the indices whenever we obtain sufficient progress -function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::AbstractArray{T}, max_value) where {T} +function FrankWolfe.compute_weak_separation_point(lmo::Hypercube, direction::AbstractArray{T}, max_value) where {T} v = similar(direction, T) v .= 0 res = zero(T) From b2a2d9b41382498a04de714190f6d87d281b3ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 21 Aug 2023 11:12:59 +0200 Subject: [PATCH 08/16] started pairwise --- src/pairwise.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 05c393deb..4f0a568f7 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -30,6 +30,7 @@ function blended_pairwise_conditional_gradient( add_dropped_vertices=false, use_extra_vertex_storage=false, recompute_last_vertex=true, + weak_separation=false, ) # add the first vertex to active set from initialization active_set = ActiveSet([(1.0, x0)]) @@ -58,6 +59,7 @@ function blended_pairwise_conditional_gradient( add_dropped_vertices=add_dropped_vertices, 
use_extra_vertex_storage=use_extra_vertex_storage, recompute_last_vertex=recompute_last_vertex, + weak_separation=weak_separation, ) end @@ -90,6 +92,7 @@ function blended_pairwise_conditional_gradient( add_dropped_vertices=false, use_extra_vertex_storage=false, recompute_last_vertex=true, + weak_separation=false, ) # format string for output of the algorithm @@ -137,7 +140,7 @@ function blended_pairwise_conditional_gradient( "MEMORY_MODE: $memory_mode STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $NumType", ) grad_type = typeof(gradient) - println("GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance") + println("GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance WEAK_SEPARATION: $weak_separation") println("Linear Minimization Oracle: $(typeof(lmo))") if use_extra_vertex_storage && !lazy @info("vertex storage only used in lazy mode") From 2deff33db41ee1daf92ccd6b84df4eeea4fe9c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 31 Aug 2023 16:46:15 +0200 Subject: [PATCH 09/16] continue weak --- src/afw.jl | 6 +++--- src/pairwise.jl | 29 +++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/afw.jl b/src/afw.jl index 59fe67982..68bddb1fe 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -440,8 +440,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ away_step_taken = false fw_step_taken = true index = -1 - #Lower our expectation for progress. - else + else # lower our expectation for progress. tt = dualstep phi = min(dual_gap, phi / 2.0) gamma_max = zero(a_lambda) @@ -471,7 +470,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE else (compute_extreme_point(lmo, gradient), 0.0) end - dual_gap = grad_dot_x - fast_dot(v, gradient) + gap + dual_gap = grad_dot_x - fast_dot(v, gradient) - gap if dual_gap > away_gap && dual_gap >= epsilon tt = gap == 0.0 ? regular : weaksep gamma_max = one(a_lambda) @@ -489,6 +488,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE fw_step_taken = false index = a_loc else + @assert gap == 0.0 tt = away gamma_max = zero(a_lambda) vertex = a diff --git a/src/pairwise.jl b/src/pairwise.jl index 4f0a568f7..fd9a5b089 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -277,27 +277,39 @@ function blended_pairwise_conditional_gradient( v = new_forward_vertex tt = lazylazy else - v = compute_extreme_point(lmo, gradient) - tt = regular + (v, gap) = if weak_separation + compute_weak_separation_point(lmo, gradient, lazy_threshold) + else + (compute_extreme_point(lmo, gradient), 0.0) + end + tt = gap == 0.0 ? regular : weaksep end else # for t == 1, v is already computed before first iteration - if t > 1 - v = compute_extreme_point(lmo, gradient) + if t == 1 + gap = 0.0 + else + (v, gap) = if weak_separation + lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance + (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + else + v = compute_extreme_point(lmo, gradient) + gap = 0.0 + end end - tt = regular + tt = gap == 0.0 ? 
regular : weaksep end end vertex_taken = v dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) # if we are about to exit, compute dual_gap with the cleaned-up x - if dual_gap ≤ epsilon + if dual_gap + gap ≤ epsilon active_set_renormalize!(active_set) active_set_cleanup!(active_set) compute_active_set_iterate!(active_set) x = get_active_set_iterate(active_set) grad!(gradient, x) - dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) + dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) + gap end # Note: In the following, we differentiate between lazy and non-lazy updates. # The reason is that the non-lazy version does not use phi but the lazy one heavily depends on it. @@ -365,7 +377,8 @@ function blended_pairwise_conditional_gradient( # that is ok as we scale with the K = 2.0 default anyways # we only update the dual gap if the step was regular (not lazy from discarded set) if tt != lazylazy - phi = dual_gap + @assert dual_gap + gap < phi + phi = dual_gap + gap @debug begin @assert tt == regular v2 = compute_extreme_point(lmo, gradient) From 05ebc9db68ba4f220025e4705c178edc963a7468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 31 Aug 2023 18:10:18 +0200 Subject: [PATCH 10/16] gap not needed for stopping --- src/pairwise.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index fd9a5b089..8cda89d7e 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -291,10 +291,11 @@ function blended_pairwise_conditional_gradient( else (v, gap) = if weak_separation lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance - (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + compute_weak_separation_point(lmo, gradient, lazy_threshold) else v = compute_extreme_point(lmo, gradient) gap = 0.0 + (v, gap) end end tt = gap == 0.0 ? regular : weaksep @@ -303,13 +304,13 @@ function blended_pairwise_conditional_gradient( vertex_taken = v dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) # if we are about to exit, compute dual_gap with the cleaned-up x - if dual_gap + gap ≤ epsilon + if dual_gap ≤ epsilon active_set_renormalize!(active_set) active_set_cleanup!(active_set) compute_active_set_iterate!(active_set) x = get_active_set_iterate(active_set) grad!(gradient, x) - dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) + gap + dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) end # Note: In the following, we differentiate between lazy and non-lazy updates. # The reason is that the non-lazy version does not use phi but the lazy one heavily depends on it. @@ -320,7 +321,7 @@ function blended_pairwise_conditional_gradient( # - for lazy: we also accept slightly weaker vertices, those satisfying phi / lazy_tolerance # this should simplify the criterion. # DO NOT CHANGE without good reason and talk to Sebastian first for the logic behind this. 
- if !lazy || dual_gap ≥ phi / lazy_tolerance + if !lazy || dual_gap ≥ phi / lazy_tolerance d = muladd_memory_mode(memory_mode, d, x, v) gamma = perform_line_search( From 192286bfaf59c750fb1e54c6667815189219b14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 31 Aug 2023 23:22:13 +0200 Subject: [PATCH 11/16] fix afw --- src/afw.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/afw.jl b/src/afw.jl index 68bddb1fe..eecc59be4 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -470,7 +470,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE else (compute_extreme_point(lmo, gradient), 0.0) end - dual_gap = grad_dot_x - fast_dot(v, gradient) - gap + dual_gap = grad_dot_x - fast_dot(v, gradient) if dual_gap > away_gap && dual_gap >= epsilon tt = gap == 0.0 ? regular : weaksep gamma_max = one(a_lambda) @@ -488,7 +488,6 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE fw_step_taken = false index = a_loc else - @assert gap == 0.0 tt = away gamma_max = zero(a_lambda) vertex = a @@ -496,7 +495,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE fw_step_taken = false index = a_loc end - return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt + return d, vertex, index, gamma_max, dual_gap + gap, away_step_taken, fw_step_taken, tt end function fw_step(x, gradient, lmo, d; memory_mode::MemoryEmphasis = InplaceEmphasis()) From 71910a41245feda09c3f8e1cf090b39fb4f4c145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 4 Sep 2023 13:01:48 +0200 Subject: [PATCH 12/16] finished implementation --- src/afw.jl | 14 ++++++++------ src/pairwise.jl | 2 +- test/weak_separation.jl | 9 ++++++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/afw.jl b/src/afw.jl index eecc59be4..bb635c7a3 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -423,16 +423,17 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ # compute new vertex with normal or weak oracle if weak_separation lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance - (v, _) = compute_weak_separation_point(lmo, gradient, lazy_threshold) - tt = weaksep + (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + tt = gap == 0.0 ? regular : weaksep else v = compute_extreme_point(lmo, gradient) + gap = zero(eltype(v)) tt = regular end end - # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex + # Real dual gap promises enough progress. if dual_gap >= phi / lazy_tolerance gamma_max = one(a_lambda) d = muladd_memory_mode(memory_mode, d, x, v) @@ -441,6 +442,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ fw_step_taken = true index = -1 else # lower our expectation for progress. 
+ @assert tt != weaksep tt = dualstep phi = min(dual_gap, phi / 2.0) gamma_max = zero(a_lambda) @@ -460,15 +462,15 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE away_gap = fast_dot(a, gradient) - grad_dot_x (v, gap) = if weak_separation # Condition for taking a FW step - # ⟨∇f, x-v⟩ ≥ gₐ + # ⟨∇f, x-v⟩ ≥ gₐ <=> # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - gₐ - # We ask for a bit more on the FW step + # We ask for a bit more progress on the FW step # to promote away steps when we can (and therefore sparsity) # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - K gₐ lazy_threshold = grad_dot_x - lazy_tolerance * away_gap compute_weak_separation_point(lmo, gradient, lazy_threshold) else - (compute_extreme_point(lmo, gradient), 0.0) + (compute_extreme_point(lmo, gradient), zero(away_gap)) end dual_gap = grad_dot_x - fast_dot(v, gradient) if dual_gap > away_gap && dual_gap >= epsilon diff --git a/src/pairwise.jl b/src/pairwise.jl index 8cda89d7e..1b9d0a9f9 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -376,7 +376,7 @@ function blended_pairwise_conditional_gradient( else # dual step # set to computed dual_gap for consistency between the lazy and non-lazy run. # that is ok as we scale with the K = 2.0 default anyways - # we only update the dual gap if the step was regular (not lazy from discarded set) + # we only update the dual gap if the step was regular or weaksep (not lazy from discarded set) if tt != lazylazy @assert dual_gap + gap < phi phi = dual_gap + gap diff --git a/test/weak_separation.jl b/test/weak_separation.jl index d21dd49f2..02f324b18 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -71,7 +71,7 @@ end @test w == v end -@testset "AFW with weak separation" begin +@testset "AFW and BPCG with weak separation" begin n = 1000 # reference point to get an optimum on a face ref_point = [0.6 + mod(idx, 2) for idx in 1:n] @@ -88,5 +88,12 @@ end tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) @test tracking_lmo.counter <= tracking_weak.counter + + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter <= tracking_weak.counter + end end From b94e95dca2473956fc33dea172f47925a507d7e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 5 Sep 2023 14:03:42 +0200 Subject: [PATCH 13/16] non-lazy BPCG --- src/pairwise.jl | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 1b9d0a9f9..71cebc739 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -204,14 +204,24 @@ function blended_pairwise_conditional_gradient( local_gap = dot_away_vertex - dot_forward_vertex if !lazy if t > 1 - v = compute_extreme_point(lmo, gradient) - dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) - phi = dual_gap + dot_x = fast_dot(gradient, x) + (v, weak_gap) = if weak_separation + # we need a separation point v + # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_threshold + # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap 
* lazy_threshold + threshold = dot_x - local_gap * lazy_threshold + compute_weak_separation_point(lmo, gradient, threshold) + else + v = compute_extreme_point(lmo, gradient) + (v, zero(phi)) + end + dual_gap = dot_x - fast_dot(gradient, v) + phi = dual_gap + weak_gap end end # minor modification from original paper for improved sparsity # (proof follows with minor modification when estimating the step) - if local_gap ≥ phi / lazy_tolerance + if local_gap ≥ phi / lazy_tolerance # pairwise step d = muladd_memory_mode(memory_mode, d, a, v_local) vertex_taken = v_local gamma_max = a_lambda From dbb6622bd55c81f68d2b1d0bccad5ac1d355f2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 5 Sep 2023 14:18:22 +0200 Subject: [PATCH 14/16] var name --- src/pairwise.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 71cebc739..4660477cb 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -207,9 +207,9 @@ function blended_pairwise_conditional_gradient( dot_x = fast_dot(gradient, x) (v, weak_gap) = if weak_separation # we need a separation point v - # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_threshold - # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap * lazy_threshold - threshold = dot_x - local_gap * lazy_threshold + # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_tolerance + # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap * lazy_tolerance + threshold = dot_x - local_gap * lazy_tolerance compute_weak_separation_point(lmo, gradient, threshold) else v = compute_extreme_point(lmo, gradient) From cb729b3a2ecd0b1629d955af7bbb6d229184b9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 5 Sep 2023 16:04:31 +0200 Subject: [PATCH 15/16] higher cardinality optimum --- test/weak_separation.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 02f324b18..41f97becc 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -73,8 +73,8 @@ end @testset "AFW and BPCG with weak separation" begin n = 1000 - # reference point to get an optimum on a face - ref_point = [0.6 + mod(idx, 2) for idx in 1:n] + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) function grad!(storage, x) storage .= x @@ -87,13 +87,12 @@ end x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) - @test tracking_lmo.counter <= tracking_weak.counter + @test tracking_lmo.counter < tracking_weak.counter tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) - @test tracking_lmo.counter <= tracking_weak.counter - + @test tracking_lmo.counter < tracking_weak.counter end end From fb6773abfc048f2c727582e87c09ac604749363a Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Sun, 10 Sep 2023 19:35:57 +0200 Subject: [PATCH 16/16] weak inner in lazified FW --- src/abstract_oracles.jl | 25 ++++++++++++++++++------- src/fw_algorithms.jl | 23 ++++++++++++----------- test/weak_separation.jl | 19 +++++++++++++++++++ 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl index ad92a5f12..de7f187c9 100644 --- a/src/abstract_oracles.jl +++ b/src/abstract_oracles.jl @@ -231,14 +231,15 @@ mutable struct VectorCacheLMO{LMO<:LinearMinimizationOracle,VT} <: inner::LMO store_cache::Bool greedy::Bool + weak_separation::Bool end function VectorCacheLMO{LMO,VT}(lmo::LMO) where {VT,LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false) + return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false, false) end function VectorCacheLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false) + return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false, false) end function Base.empty!(lmo::VectorCacheLMO) @@ -250,12 +251,17 @@ Base.length(lmo::VectorCacheLMO) = length(lmo.vertices) function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value; kwargs...) if isempty(lmo.vertices) - v = compute_extreme_point(lmo.inner, direction) + v, gap = if lmo.weak_separation + compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...) + else + v = compute_extreme_point(lmo.inner, direction; kwargs...) + v, zero(eltype(v)) + end T = promote_type(eltype(v), eltype(direction)) if lmo.store_cache push!(lmo.vertices, v) end - return v, zero(T) + return v, T(gap) end best_idx = -1 best_val = Inf @@ -281,8 +287,13 @@ function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value T = promote_type(eltype(best_v), eltype(direction)) return best_v, T(Inf) end - # no satisfactory vertex found - v = compute_extreme_point(lmo.inner, direction) + # no satisfactory vertex found, call oracle + v, gap = if lmo.weak_separation + compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...) + else + v = compute_extreme_point(lmo.inner, direction; kwargs...) + v, zero(eltype(v)) + end if lmo.store_cache # note: we do not check for duplicates. 
hence you might end up with more vertices, # in fact up to number of dual steps many, that might be already in the cache @@ -294,7 +305,7 @@ function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value push!(lmo.vertices, v) end T = promote_type(eltype(v), eltype(direction)) - return v, zero(T) + return v, T(gap) end function compute_extreme_point( diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 4102cd2d8..814c44473 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -273,6 +273,7 @@ function lazified_conditional_gradient( VType=typeof(x0), timeout=Inf, linesearch_workspace=nothing, + weak_separation=false, ) # format string for output of the algorithm @@ -297,6 +298,7 @@ function lazified_conditional_gradient( if isfinite(cache_size) Base.sizehint!(lmo.vertices, cache_size) end + lmo.weak_separation = weak_separation t = 0 dual_gap = Inf @@ -351,7 +353,7 @@ function lazified_conditional_gradient( linesearch_workspace = build_linesearch_workspace(line_search, x, gradient) end - while t <= max_iteration && dual_gap >= max(epsilon, eps(float(eltype(x)))) + while t <= max_iteration && phi >= max(epsilon, eps(float(eltype(x)))) ##################### # managing time @@ -383,18 +385,17 @@ function lazified_conditional_gradient( primal = f(x) end - v, gap = compute_weak_separation_point(lmo, gradient, threshold) - if gap > 0 - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, phi / 2) - end + v, weak_gap = compute_weak_separation_point(lmo, gradient, threshold) + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + @assert weak_gap == 0 + phi = min(dual_gap, phi / 2) else - tt = regular + tt = weak_gap > 0 ? weaksep : regular dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = dual_gap + phi = min(phi, dual_gap + weak_gap) end d = muladd_memory_mode(memory_mode, d, x, v) diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 41f97becc..b7c214f9d 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -96,3 +96,22 @@ end @test tracking_lmo.counter < tracking_weak.counter end end + +@testset "Lazy-FW with weak separation" begin + n = 1000 + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function grad!(storage, x) + storage .= x + storage .-= ref_point + storage ./= n + end + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false) + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true) + @test tracking_lmo.counter < tracking_weak.counter +end
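
Usage sketch for the weak separation interface. This is an illustrative example only: the `Box` oracle type, the quadratic objective, `x_ref`, `n`, and the parameter choices below are made up for demonstration and assume the patches above are applied; the `compute_weak_separation_point` contract and the `weak_separation` keyword are the ones introduced in PATCH 01 and PATCH 04, and the heuristic mirrors the `Hypercube` test LMO from test/weak_separation.jl.

using FrankWolfe
using LinearAlgebra

# Toy feasible region: the box [0, 1]^n (hypothetical example type).
struct Box <: FrankWolfe.LinearMinimizationOracle end

# Exact oracle: the optimal vertex sets a coordinate to 1 exactly when the
# corresponding entry of the direction is negative.
function FrankWolfe.compute_extreme_point(::Box, direction::AbstractVector{T}; kwargs...) where {T}
    return T.(direction .< 0)
end

# Heuristic weak oracle: build the same vertex coordinate by coordinate and
# stop as soon as ⟨direction, v⟩ ≤ max_value. An early exit returns a valid but
# possibly suboptimal vertex with gap = Inf (no optimality bound); running
# through all coordinates yields the exact vertex, so gap = 0.
function FrankWolfe.compute_weak_separation_point(::Box, direction::AbstractVector{T}, max_value; kwargs...) where {T}
    v = zeros(T, length(direction))
    val = zero(T)
    for i in eachindex(direction)
        if direction[i] < 0
            v[i] = one(T)
            val += direction[i]
        end
        if val <= max_value && i != lastindex(direction)
            return v, T(Inf)   # inexact vertex, no optimality bound
        end
    end
    return v, zero(T)          # loop completed: vertex is exact
end

n = 100
x_ref = fill(0.3, n)                        # target point inside the box
f(x) = 0.5 * norm(x - x_ref)^2
grad!(storage, x) = (storage .= x .- x_ref)

x0 = FrankWolfe.compute_extreme_point(Box(), -ones(n))
x, v, primal, dual_gap = FrankWolfe.away_frank_wolfe(
    f, grad!, Box(), x0;
    lazy=true,
    weak_separation=true,   # call the weak oracle whenever it can promise enough progress
)

When the weak oracle exits early (gap = Inf), the step is reported as `weaksep` ("W" in the verbose output); dual steps, which halve the phi estimate, are only taken from exact oracle answers (gap = 0), as the assertions in the patches above enforce.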