diff --git a/docs/src/reference/2_lmo.md b/docs/src/reference/2_lmo.md
index b75d936dd..72459b9a2 100644
--- a/docs/src/reference/2_lmo.md
+++ b/docs/src/reference/2_lmo.md
@@ -5,7 +5,7 @@ The Linear Minimization Oracle (LMO) is a key component called at each iteration
 v\in \argmin_{x\in \mathcal{C}} \langle d,x \rangle.
 ```
 
-See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on most LMOs
+See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on essential LMOs
 implemented in the package and their comparison with projection operators.
 
 ## Interface and wrappers
@@ -19,6 +19,13 @@ All of them are subtypes of [`FrankWolfe.LinearMinimizationOracle`](@ref) and im
 compute_extreme_point
 ```
 
+Optionally, an LMO can implement a weak separation procedure based either on a heuristic or on an approximation algorithm:
+```@docs
+compute_weak_separation_point
+```
+
+Weak separation procedures are used only by the lazified variants and by the methods maintaining an active set.
+
 We also provide some meta-LMOs wrapping another one with extended behavior:
 ```@docs
 FrankWolfe.CachedLinearMinimizationOracle
diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl
index 2fe2f9e9e..de7f187c9 100644
--- a/src/abstract_oracles.jl
+++ b/src/abstract_oracles.jl
@@ -17,6 +17,29 @@ All LMOs should accept keyword arguments that they can ignore.
 """
 function compute_extreme_point end
 
+"""
+    compute_weak_separation_point(lmo, direction, max_value) -> (vertex, gap)
+
+Weak separation algorithm for a given oracle.
+Unlike `compute_extreme_point`, `compute_weak_separation_point` may return a suboptimal `vertex` under the following conditions:
+- `vertex` is always a valid extreme point of the polytope.
+If an inexact vertex is computed:
+- `⟨v, d⟩ ≤ max_value`, i.e. the vertex attains the pre-specified required improvement.
+- `⟨v, d⟩ ≤ ⟨v_opt, d⟩ + gap`, with `v_opt` a vertex computed by the exact oracle.
+- If the algorithm computing the inexact vertex provides a bound on its optimality, the returned `gap` must be valid.
+- Otherwise, e.g. if the vertex is computed with a heuristic, `gap = ∞`.
+Otherwise (the oracle computed an optimal vertex):
+- `⟨v, d⟩` may be greater than `max_value`, and `gap` must be 0.
+"""
+function compute_weak_separation_point(lmo, direction, max_value; kwargs...) end
+
+# default to computing an exact vertex.
+function compute_weak_separation_point(lmo::LinearMinimizationOracle, direction, max_value; kwargs...)
+    v = compute_extreme_point(lmo, direction; kwargs...)
+    gap = zero(eltype(v)) * zero(eltype(direction))
+    return v, gap
+end
+
 """
     CachedLinearMinimizationOracle{LMO}
 
@@ -43,11 +66,29 @@ Vertices of `LMO` have to be of type `VT` if provided.
 mutable struct SingleLastCachedLMO{LMO,A} <: CachedLinearMinimizationOracle{LMO}
     last_vertex::Union{Nothing,A}
     inner::LMO
+    store_cache::Bool
 end
 
 # initializes with no cache by default
 SingleLastCachedLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} =
-    SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo)
+    SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo, true)
+
+# gap is 0 if the vertex is exact, ∞ if it comes from the cache
+function compute_weak_separation_point(lmo::SingleLastCachedLMO, direction, max_value; kwargs...)
+    if lmo.last_vertex !== nothing && isfinite(max_value)
+        # the cached vertex provides sufficient decrease
+        if fast_dot(lmo.last_vertex, direction) ≤ max_value
+            T = promote_type(eltype(lmo.last_vertex), eltype(direction))
+            return lmo.last_vertex, T(Inf)
+        end
+    end
+    v = compute_extreme_point(lmo.inner, direction; kwargs...)
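+    # no usable cached vertex: fall back to the exact inner oracle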
+    if lmo.store_cache
+        lmo.last_vertex = v
+    end
+    T = promote_type(eltype(v), eltype(direction))
+    return v, zero(T)
+end
 
 function compute_extreme_point(
     lmo::SingleLastCachedLMO,
@@ -62,7 +103,7 @@ function compute_extreme_point(
             return lmo.last_vertex
         end
     end
-    v = compute_extreme_point(lmo.inner, direction, kwargs...)
+    v = compute_extreme_point(lmo.inner, direction; v=v, kwargs...)
     if store_cache
         lmo.last_vertex = v
     end
@@ -188,14 +229,17 @@ mutable struct VectorCacheLMO{LMO<:LinearMinimizationOracle,VT} <:
        CachedLinearMinimizationOracle{LMO}
     vertices::Vector{VT}
     inner::LMO
+    store_cache::Bool
+    greedy::Bool
+    weak_separation::Bool
 end
 
 function VectorCacheLMO{LMO,VT}(lmo::LMO) where {VT,LMO<:LinearMinimizationOracle}
-    return VectorCacheLMO{LMO,VT}(VT[], lmo)
+    return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false, false)
 end
 
 function VectorCacheLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle}
-    return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo)
+    return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false, false)
 end
 
 function Base.empty!(lmo::VectorCacheLMO)
@@ -205,6 +249,65 @@ end
 
 Base.length(lmo::VectorCacheLMO) = length(lmo.vertices)
 
+function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value; kwargs...)
+    if isempty(lmo.vertices)
+        v, gap = if lmo.weak_separation
+            compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...)
+        else
+            v = compute_extreme_point(lmo.inner, direction; kwargs...)
+            v, zero(eltype(v))
+        end
+        T = promote_type(eltype(v), eltype(direction))
+        if lmo.store_cache
+            push!(lmo.vertices, v)
+        end
+        return v, T(gap)
+    end
+    best_idx = -1
+    best_val = Inf
+    best_v = nothing
+    for idx in reverse(eachindex(lmo.vertices))
+        @inbounds v = lmo.vertices[idx]
+        new_val = fast_dot(v, direction)
+        if new_val ≤ max_value
+            T = promote_type(eltype(v), eltype(direction))
+            # stop and return
+            if lmo.greedy
+                return v, T(Inf)
+            end
+            # otherwise, compare to incumbent
+            if new_val < best_val
+                best_v = v
+                best_val = new_val
+                best_idx = idx
+            end
+        end
+    end
+    if best_idx > 0
+        T = promote_type(eltype(best_v), eltype(direction))
+        return best_v, T(Inf)
+    end
+    # no satisfactory vertex found, call the oracle
+    v, gap = if lmo.weak_separation
+        compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...)
+    else
+        v = compute_extreme_point(lmo.inner, direction; kwargs...)
+        v, zero(eltype(v))
+    end
+    if lmo.store_cache
+        # note: we do not check for duplicates, hence the cache might end up with extra copies
+        # of vertices, up to as many as the number of dual steps performed.
+        # to reach this point, if v was already in the cache, it must not meet the threshold (otherwise we would have returned it)
+        # and it is the best vertex possible, hence we will perform a dual step on the outside.
+        #
+        # note: another possibility would be to check for duplicates before pushing, but then the same vertex might be recomputed a few times.
+        # as such, this is likely the better tradeoff, i.e., to not check the set for duplicates and potentially accept #dual_steps many duplicates.
+        push!(lmo.vertices, v)
+    end
+    T = promote_type(eltype(v), eltype(direction))
+    return v, T(gap)
+end
+
 function compute_extreme_point(
     lmo::VectorCacheLMO,
     direction;
@@ -228,7 +331,7 @@ function compute_extreme_point(
         @inbounds v = lmo.vertices[idx]
         new_val = fast_dot(v, direction)
         if new_val ≤ threshold
-            # stop, store and return
+            # stop and return
             if greedy
                 return v
             end
diff --git a/src/afw.jl b/src/afw.jl
index ee96824f5..e43c16e2c 100644
--- a/src/afw.jl
+++ b/src/afw.jl
@@ -33,6 +33,7 @@ function away_frank_wolfe(
     use_extra_vertex_storage=false,
     linesearch_workspace=nothing,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
     # add the first vertex to active set from initialization
     active_set = ActiveSet([(1.0, x0)])
@@ -64,6 +65,7 @@ function away_frank_wolfe(
         use_extra_vertex_storage=use_extra_vertex_storage,
         linesearch_workspace=linesearch_workspace,
         recompute_last_vertex=recompute_last_vertex,
+        weak_separation=weak_separation,
     )
 end
 
@@ -95,6 +97,7 @@ function away_frank_wolfe(
     use_extra_vertex_storage=false,
     linesearch_workspace=nothing,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
     # format string for output of the algorithm
     format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n"
@@ -153,7 +156,7 @@ function away_frank_wolfe(
         )
         grad_type = typeof(gradient)
         println(
-            "GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance MOMENTUM: $momentum AWAYSTEPS: $away_steps",
+            "GRADIENT TYPE: $grad_type LAZY: $lazy LAZY_TOLERANCE: $lazy_tolerance WEAK_SEPARATION: $weak_separation MOMENTUM: $momentum AWAYSTEPS: $away_steps",
         )
         println("Linear Minimization Oracle: $(typeof(lmo))")
     if (use_extra_vertex_storage || add_dropped_vertices) && extra_vertex_storage === nothing
@@ -223,10 +226,16 @@ function away_frank_wolfe(
                     extra_vertex_storage=extra_vertex_storage,
                     lazy_tolerance=lazy_tolerance,
                     memory_mode=memory_mode,
+                    weak_separation=weak_separation,
                 )
             else
                 d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                    afw_step(x, gradient, lmo, active_set, epsilon, d, memory_mode=memory_mode)
+                    afw_step(
+                        x, gradient, lmo, active_set, epsilon, d,
+                        memory_mode=memory_mode,
+                        weak_separation=weak_separation,
+                        lazy_tolerance=lazy_tolerance,
+                    )
             end
         else
             d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
@@ -248,7 +257,7 @@ function away_frank_wolfe(
             memory_mode,
         )
 
-        gamma = min(gamma_max, gamma)
+        gamma = min(gamma_max, gamma)
+        # cleanup and renormalize every x iterations. Only for the fw steps.
         renorm = mod(t, renorm_interval) == 0
         if away_step_taken
@@ -368,9 +377,9 @@ function away_frank_wolfe(
     return x, v, primal, dual_gap, traj_data, active_set
 end
 
-function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis())
+function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=false)
     _, v, v_loc, _, a_lambda, a, a_loc, _, _ = active_set_argminmax(active_set, gradient)
-    #Do lazy FW step
+    # do lazy FW step
    grad_dot_lazy_fw_vertex = fast_dot(v, gradient)
    grad_dot_x = fast_dot(x, gradient)
    grad_dot_a = fast_dot(a, gradient)
@@ -385,7 +394,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
         fw_step_taken = true
         index = v_loc
     else
-        #Do away step, as it promises enough progress.
+        # do away step, as it promises enough progress.
         if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex &&
            grad_dot_a - grad_dot_x >= phi / lazy_tolerance
             tt = away
@@ -395,7 +404,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
             away_step_taken = true
             fw_step_taken = false
             index = a_loc
-        #Resort to calling the LMO
+        # resort to calling the LMO
         else
             # optionally: try vertex storage
             if use_extra_vertex_storage
@@ -406,17 +415,25 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
                     @debug("Found acceptable lazy vertex in storage")
                     v = new_forward_vertex
                     tt = lazylazy
+                end
+            else
+                found_better_vertex = false
+            end
+            if !found_better_vertex
+                # compute new vertex with normal or weak oracle
+                if weak_separation
+                    lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance
+                    (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold)
+                    tt = gap == 0.0 ? regular : weaksep
                 else
                     v = compute_extreme_point(lmo, gradient)
+                    gap = zero(eltype(v))
                     tt = regular
                 end
-            else
-                v = compute_extreme_point(lmo, gradient)
-                tt = regular
             end
-            # Real dual gap promises enough progress.
             grad_dot_fw_vertex = fast_dot(v, gradient)
             dual_gap = grad_dot_x - grad_dot_fw_vertex
+            # Real dual gap promises enough progress.
             if dual_gap >= phi / lazy_tolerance
                 gamma_max = one(a_lambda)
                 d = muladd_memory_mode(memory_mode, d, x, v)
@@ -424,8 +441,8 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
                 away_step_taken = false
                 fw_step_taken = true
                 index = -1
-            #Lower our expectation for progress.
-            else
+            else # lower our expectation for progress.
+                @assert tt != weaksep
                 tt = dualstep
                 phi = min(dual_gap, phi / 2.0)
                 gamma_max = zero(a_lambda)
@@ -439,14 +456,25 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
     return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt
 end
 
-function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis())
+function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=false, lazy_tolerance=2.0)
     _, _, _, _, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient)
-    v = compute_extreme_point(lmo, gradient)
     grad_dot_x = fast_dot(x, gradient)
     away_gap = fast_dot(a, gradient) - grad_dot_x
+    (v, gap) = if weak_separation
+        # Condition for taking a FW step:
+        # ⟨∇f, x-v⟩ ≥ gₐ <=>
+        # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - gₐ
+        # We ask for a bit more progress on the FW step
+        # to promote away steps when we can (and therefore sparsity):
+        # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - K gₐ
+        lazy_threshold = grad_dot_x - lazy_tolerance * away_gap
+        compute_weak_separation_point(lmo, gradient, lazy_threshold)
+    else
+        (compute_extreme_point(lmo, gradient), zero(away_gap))
+    end
     dual_gap = grad_dot_x - fast_dot(v, gradient)
-    if dual_gap >= away_gap && dual_gap >= epsilon
-        tt = regular
+    if dual_gap > away_gap && dual_gap >= epsilon
+        tt = gap == 0.0 ? regular : weaksep
         gamma_max = one(a_lambda)
         d = muladd_memory_mode(memory_mode, d, x, v)
         vertex = v
@@ -469,7 +497,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE
         fw_step_taken = false
         index = a_loc
     end
-    return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt
+    return d, vertex, index, gamma_max, dual_gap + gap, away_step_taken, fw_step_taken, tt
 end
 
 function fw_step(x, gradient, lmo, d; memory_mode::MemoryEmphasis = InplaceEmphasis())
diff --git a/src/defs.jl b/src/defs.jl
index db3c8c0f1..c251b8181 100644
--- a/src/defs.jl
+++ b/src/defs.jl
@@ -19,6 +19,7 @@ struct OutplaceEmphasis <: MemoryEmphasis end
     away = 6
     pairwise = 7
     drop = 8
+    weaksep = 9
     simplex_descent = 101
     gap_step = 102
     last = 1000
@@ -34,6 +35,7 @@ const st = (
     away="A",
     pairwise="P",
     drop="D",
+    weaksep="W",
     simplex_descent="SD",
     gap_step="GS",
     last="Last",
diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl
index daddb6145..814c44473 100644
--- a/src/fw_algorithms.jl
+++ b/src/fw_algorithms.jl
@@ -273,6 +273,7 @@ function lazified_conditional_gradient(
     VType=typeof(x0),
     timeout=Inf,
     linesearch_workspace=nothing,
+    weak_separation=false,
 )
 
     # format string for output of the algorithm
@@ -293,14 +294,16 @@ function lazified_conditional_gradient(
     end
 
     lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base)
+    lmo.greedy = greedy_lazy
     if isfinite(cache_size)
         Base.sizehint!(lmo.vertices, cache_size)
     end
+    lmo.weak_separation = weak_separation
 
     t = 0
     dual_gap = Inf
     primal = Inf
-    v = []
+    v = x0
     x = x0
     phi = Inf
     tt = regular
@@ -350,10 +353,10 @@ function lazified_conditional_gradient(
         linesearch_workspace = build_linesearch_workspace(line_search, x, gradient)
     end
 
-    while t <= max_iteration && dual_gap >= max(epsilon, eps(float(eltype(x))))
+    while t <= max_iteration && phi >= max(epsilon, eps(float(eltype(x))))
 
         #####################
-        # managing time and Ctrl-C
+        # managing time
         #####################
         time_at_loop = time_ns()
         if t == 0
@@ -382,12 +385,17 @@ function lazified_conditional_gradient(
             primal = f(x)
         end
 
-        v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy)
+        v, weak_gap = compute_weak_separation_point(lmo, gradient, threshold)
         tt = lazy
         if fast_dot(v, gradient) > threshold
             tt = dualstep
             dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
+            @assert weak_gap == 0
             phi = min(dual_gap, phi / 2)
+        else
+            tt = weak_gap > 0 ? weaksep : regular
+            dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
+            phi = min(phi, dual_gap + weak_gap)
         end
 
         d = muladd_memory_mode(memory_mode, d, x, v)
diff --git a/src/moi_oracle.jl b/src/moi_oracle.jl
index 875b8001d..18014a225 100644
--- a/src/moi_oracle.jl
+++ b/src/moi_oracle.jl
@@ -128,7 +128,6 @@ function _optimize_and_return(lmo, variables)
     term_st = MOI.get(lmo.o, MOI.TerminationStatus())
     if term_st ∉ (MOI.OPTIMAL, MOI.ALMOST_OPTIMAL, MOI.SLOW_PROGRESS)
         @error "Unexpected termination: $term_st"
-        return MOI.get.(lmo.o, MOI.VariablePrimal(), variables)
     end
     return MOI.get.(lmo.o, MOI.VariablePrimal(), variables)
 end
diff --git a/src/pairwise.jl b/src/pairwise.jl
index 1f401405b..16bd2fb80 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -30,6 +30,7 @@ function blended_pairwise_conditional_gradient(
     add_dropped_vertices=false,
     use_extra_vertex_storage=false,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
     # add the first vertex to active set from initialization
     active_set = ActiveSet([(1.0, x0)])
@@ -58,6 +59,7 @@ function blended_pairwise_conditional_gradient(
         add_dropped_vertices=add_dropped_vertices,
         use_extra_vertex_storage=use_extra_vertex_storage,
         recompute_last_vertex=recompute_last_vertex,
+        weak_separation=weak_separation,
     )
 end
 
@@ -90,6 +92,7 @@ function blended_pairwise_conditional_gradient(
     add_dropped_vertices=false,
     use_extra_vertex_storage=false,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
 
     # format string for output of the algorithm
@@ -137,7 +140,7 @@ function blended_pairwise_conditional_gradient(
             "MEMORY_MODE: $memory_mode STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $NumType",
         )
         grad_type = typeof(gradient)
-        println("GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance")
+        println("GRADIENT TYPE: $grad_type LAZY: $lazy LAZY_TOLERANCE: $lazy_tolerance WEAK_SEPARATION: $weak_separation")
         println("Linear Minimization Oracle: $(typeof(lmo))")
         if use_extra_vertex_storage && !lazy
             @info("vertex storage only used in lazy mode")
@@ -201,14 +204,24 @@ function blended_pairwise_conditional_gradient(
         local_gap = dot_away_vertex - dot_forward_vertex
         if !lazy
             if t > 1
-                v = compute_extreme_point(lmo, gradient)
-                dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v)
-                phi = dual_gap
+                dot_x = fast_dot(gradient, x)
+                (v, weak_gap) = if weak_separation
+                    # we need a separation point v with
+                    # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_tolerance, i.e.
+                    # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap * lazy_tolerance
+                    threshold = dot_x - local_gap * lazy_tolerance
+                    compute_weak_separation_point(lmo, gradient, threshold)
+                else
+                    (compute_extreme_point(lmo, gradient), zero(phi))
+                end
+                dual_gap = dot_x - fast_dot(gradient, v)
+                phi = dual_gap + weak_gap
             end
         end
         # minor modification from original paper for improved sparsity
         # (proof follows with minor modification when estimating the step)
-        if local_gap ≥ phi / lazy_tolerance
+        if local_gap ≥ phi / lazy_tolerance # pairwise step
             d = muladd_memory_mode(memory_mode, d, a, v_local)
             vertex_taken = v_local
             gamma_max = a_lambda
@@ -274,15 +287,28 @@ function blended_pairwise_conditional_gradient(
                     v = new_forward_vertex
                     tt = lazylazy
                 else
-                    v = compute_extreme_point(lmo, gradient)
-                    tt = regular
+                    (v, gap) = if weak_separation
+                        compute_weak_separation_point(lmo, gradient, lazy_threshold)
+                    else
+                        (compute_extreme_point(lmo, gradient), 0.0)
+                    end
+                    tt = gap == 0.0 ? regular : weaksep
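+                    # gap == 0 certifies an exact vertex; a positive (possibly infinite) gap
+                    # marks a weak-separation vertex that only meets the lazy threshold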
                 end
             else
                 # for t == 1, v is already computed before first iteration
-                if t > 1
-                    v = compute_extreme_point(lmo, gradient)
+                if t == 1
+                    gap = 0.0
+                else
+                    (v, gap) = if weak_separation
+                        lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance
+                        compute_weak_separation_point(lmo, gradient, lazy_threshold)
+                    else
+                        (compute_extreme_point(lmo, gradient), 0.0)
+                    end
                 end
-                tt = regular
+                tt = gap == 0.0 ? regular : weaksep
             end
         end
         vertex_taken = v
@@ -305,7 +331,7 @@ function blended_pairwise_conditional_gradient(
         # - for lazy: we also accept slightly weaker vertices, those satisfying phi / lazy_tolerance
         # this should simplify the criterion.
        # DO NOT CHANGE without good reason and talk to Sebastian first for the logic behind this.
-        if !lazy || dual_gap ≥ phi / lazy_tolerance
+        if !lazy || dual_gap ≥ phi / lazy_tolerance
             d = muladd_memory_mode(memory_mode, d, x, v)
 
             gamma = perform_line_search(
@@ -360,9 +386,10 @@ function blended_pairwise_conditional_gradient(
         else # dual step
             # set to computed dual_gap for consistency between the lazy and non-lazy run.
             # that is ok as we scale with the K = 2.0 default anyways
-            # we only update the dual gap if the step was regular (not lazy from discarded set)
+            # we only update the dual gap if the step was regular or weaksep (not lazy from the discarded set)
             if tt != lazylazy
-                phi = dual_gap
+                @assert dual_gap + gap < phi
+                phi = dual_gap + gap
                 @debug begin
                     @assert tt == regular
                     v2 = compute_extreme_point(lmo, gradient)
diff --git a/src/tracking.jl b/src/tracking.jl
index 949806fbf..205043cc1 100644
--- a/src/tracking.jl
+++ b/src/tracking.jl
@@ -55,7 +55,12 @@ end
 
 function compute_extreme_point(lmo::TrackingLMO, x; kwargs...)
     lmo.counter += 1
-    return compute_extreme_point(lmo.lmo, x)
+    return compute_extreme_point(lmo.lmo, x; kwargs...)
+end
+
+function compute_weak_separation_point(lmo::TrackingLMO, direction, max_value; kwargs...)
+    lmo.counter += 1
+    return compute_weak_separation_point(lmo.lmo, direction, max_value; kwargs...)
 end
 
 is_tracking_lmo(lmo) = false
diff --git a/test/runtests.jl b/test/runtests.jl
index 1d0c9c4c9..fa9c9b29e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -785,3 +785,10 @@ include("generic-arrays.jl")
         end
     end
 end
+
+module WeakSeparation
+using Test
+@testset "weak separation oracles" begin
+    include("weak_separation.jl")
+end
+end
diff --git a/test/weak_separation.jl b/test/weak_separation.jl
new file mode 100644
index 000000000..b7c214f9d
--- /dev/null
+++ b/test/weak_separation.jl
@@ -0,0 +1,117 @@
+using FrankWolfe
+using LinearAlgebra
+using Test
+
+"""
+A test LMO representing the feasible set [0,1]^n.
+`nops` tracks the number of operations performed.
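+Both oracles charge `nops` with the number of entries of `direction` they inspect.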
+""" +mutable struct Hypercube <: FrankWolfe.LinearMinimizationOracle + nops::Int +end + +Hypercube() = Hypercube(0) + +function FrankWolfe.compute_extreme_point(lmo::Hypercube, direction::AbstractArray{T}) where {T} + v = similar(direction, T) + v .= 0 + for idx in eachindex(v) + if direction[idx] < 0 + v[idx] = 1 + end + end + lmo.nops += length(v) + return v +end + +# a very naive implementation of a weak separation oracle without a gap (a heuristic) +# we stop iterating through the indices whenever we obtain sufficient progress +function FrankWolfe.compute_weak_separation_point(lmo::Hypercube, direction::AbstractArray{T}, max_value) where {T} + v = similar(direction, T) + v .= 0 + res = zero(T) + gap = zero(T) + nops = 0 + for idx in eachindex(v) + nops += 1 + if direction[idx] < 0 + v[idx] = 1 + res += direction[idx] + end + if res <= max_value && idx != lastindex(v) + gap = T(Inf) + break + end + end + lmo.nops += nops + return v, gap +end + +@testset "Basic behaviour" begin + lmo = Hypercube() + direction = rand(4,2) + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test size(v) == size(direction) + @test norm(v) == 0 + (w, _) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test norm(w) == 0 + direction .*= -1 + direction[end] -= 1 + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test v == ones(size(direction)) + (w, gap0) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test dot(w, direction) <= -1 + @test gap0 == Inf + (w, gap1) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1.5) + @test dot(w, direction) <= -1.5 + @test gap1 == Inf + # asking for too much progress results in exact LMO call + (w, gap3) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1000) + @test gap3 == 0.0 + @test w == v +end + +@testset "AFW and BPCG with weak separation" begin + n = 1000 + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function grad!(storage, x) + storage .= x + storage .-= ref_point + storage ./= n + end + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) + for lazy in (false, true) + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter < tracking_weak.counter + + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter < tracking_weak.counter + end +end + +@testset "Lazy-FW with weak separation" begin + n = 1000 + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function 
+        storage .= x
+        storage .-= ref_point
+        storage ./= n
+    end
+    x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n))
+    tracking_lmo = FrankWolfe.TrackingLMO(Hypercube())
+    x, v, primal, dual_gap, trajectory_exact = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false)
+    x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n))
+    tracking_weak = FrankWolfe.TrackingLMO(Hypercube())
+    x, v, primal, dual_gap, trajectory_weak = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true)
+    @test tracking_lmo.counter < tracking_weak.counter
+end
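+
+# A minimal usage sketch of the weak oracle contract, reusing the `Hypercube` LMO
+# defined above (the thresholds below are illustrative values, not part of the
+# package API): `gap == Inf` flags a heuristic vertex that merely satisfies the
+# requested `max_value`, while `gap == 0` certifies an exact vertex.
+@testset "Weak oracle contract on the hypercube" begin
+    lmo = Hypercube()
+    direction = -ones(5)
+    # modest target: the heuristic may stop scanning early
+    (v, gap) = FrankWolfe.compute_weak_separation_point(lmo, direction, -2.0)
+    @test dot(v, direction) <= -2.0 || gap == 0.0
+    # unattainable target: forces a full scan, hence an exact vertex with zero gap
+    (w, gap_exact) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1000.0)
+    @test gap_exact == 0.0
+    @test w == FrankWolfe.compute_extreme_point(Hypercube(), direction)
+end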