diff --git a/src/FrankWolfe.jl b/src/FrankWolfe.jl
index 0f5eee244..4c403870f 100644
--- a/src/FrankWolfe.jl
+++ b/src/FrankWolfe.jl
@@ -41,7 +41,7 @@ include("fw_algorithms.jl")
 
 # collecting most common data types etc and precompile
 # min version req set to 1.5 to prevent stalling of julia 1
-@static if VERSION >= v"1.5"
+@static if VERSION >= v"1.5"
+    println("Precompiling common signatures. This might take a moment...")
     include("precompile.jl")
 end
diff --git a/src/afw.jl b/src/afw.jl
index 2c4a84c58..c90f75f16 100644
--- a/src/afw.jl
+++ b/src/afw.jl
@@ -32,6 +32,7 @@ function away_frank_wolfe(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -89,7 +90,7 @@ function away_frank_wolfe(
 
     x = compute_active_set_iterate(active_set)
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     phi_value = max(0, fast_dot(x, gradient) - fast_dot(v, gradient))
     gamma = 1.0
 
@@ -128,14 +129,14 @@ function away_frank_wolfe(
         if away_steps
             if lazy
                 d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                    lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K)
+                    lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K, kwargs...)
             else
                 d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                    afw_step(x, gradient, lmo, active_set)
+                    afw_step(x, gradient, lmo, active_set; kwargs...)
             end
         else
             d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                fw_step(x, gradient, lmo)
+                fw_step(x, gradient, lmo; kwargs...)
         end
 
         if fw_step_taken || away_step_taken
@@ -215,7 +216,7 @@ function away_frank_wolfe(
     if verbose
         x = compute_active_set_iterate(active_set)
         grad!(gradient, x)
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
         primal = f(x)
         dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
         tt = last
@@ -237,7 +238,7 @@ function away_frank_wolfe(
     active_set_cleanup!(active_set)
     x = compute_active_set_iterate(active_set)
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
     if verbose
@@ -260,9 +261,9 @@ function away_frank_wolfe(
     return x, v, primal, dual_gap, traj_data, active_set
 end
 
-function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
+function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0, kwargs...)
     v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient)
-    #Do lazy FW step
+    # Do lazy FW step
     grad_dot_lazy_fw_vertex = fast_dot(v, gradient)
     grad_dot_x = fast_dot(x, gradient)
     grad_dot_a = fast_dot(a, gradient)
@@ -276,7 +277,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
         fw_step_taken = true
         index = v_loc
     else
-        #Do away step, as it promises enough progress.
+        # Do away step, as it promises enough progress.
         if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex &&
            grad_dot_a - grad_dot_x >= phi / K
             tt = away
@@ -286,9 +287,9 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
             away_step_taken = true
             fw_step_taken = false
             index = a_loc
-        #Resort to calling the LMO
+        # Resort to calling the LMO
         else
-            v = compute_extreme_point(lmo, gradient)
+            v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
             # Real dual gap promises enough progress.
             grad_dot_fw_vertex = fast_dot(v, gradient)
             dual_gap = grad_dot_x - grad_dot_fw_vertex
@@ -300,7 +301,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
                 away_step_taken = false
                 fw_step_taken = true
                 index = nothing
-            #Lower our expectation for progress.
+            # Lower our expectation for progress.
             else
                 tt = dualstep
                 phi = min(dual_gap, phi / 2.0)
@@ -316,11 +317,11 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
     return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt
 end
 
-function afw_step(x, gradient, lmo, active_set)
+function afw_step(x, gradient, lmo, active_set; kwargs...)
     local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc =
         active_set_argminmax(active_set, gradient)
     away_gap = fast_dot(a, gradient) - fast_dot(x, gradient)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     grad_dot_x = fast_dot(x, gradient)
     away_gap = fast_dot(a, gradient) - grad_dot_x
     dual_gap = grad_dot_x - fast_dot(v, gradient)
@@ -344,8 +345,8 @@ function afw_step(x, gradient, lmo, active_set)
     return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt
 end
 
-function fw_step(x, gradient, lmo)
-    vertex = compute_extreme_point(lmo, gradient)
+function fw_step(x, gradient, lmo; kwargs...)
+    vertex = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     return (
         x - vertex,
         vertex,
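Aside, not part of the patch: with the algorithms now forwarding `x=x; kwargs...` to every `compute_extreme_point` call, a custom LMO only needs to accept and ignore keywords it does not use. A minimal hypothetical sketch (the name `HyperCubeLMO` is made up for illustration; the package's own `LpNormLMO{Inf}` already plays this role):

import FrankWolfe

struct HyperCubeLMO <: FrankWolfe.LinearMinimizationOracle
    radius::Float64
end

function FrankWolfe.compute_extreme_point(lmo::HyperCubeLMO, direction; kwargs...)
    # vertex of the box [-radius, radius]^n minimizing <direction, v>;
    # `x` and any other forwarded keywords are swallowed by `kwargs...`
    return [d > 0 ? -lmo.radius : lmo.radius for d in direction]
end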
diff --git a/src/blended_cg.jl b/src/blended_cg.jl
index cd51de342..cf6547fc8 100644
--- a/src/blended_cg.jl
+++ b/src/blended_cg.jl
@@ -51,7 +51,7 @@ function blended_conditional_gradient(
     primal = f(x)
     grad!(gradient, x)
     # initial gap estimate computation
-    vmax = compute_extreme_point(lmo, gradient)
+    vmax = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
     phi = fast_dot(gradient, x0 - vmax) / 2
     dual_gap = phi
     traj_data = []
@@ -243,7 +243,7 @@ function blended_conditional_gradient(
     if verbose
         x = compute_active_set_iterate(active_set)
         grad!(gradient, x)
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
         primal = f(x)
         dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
         tot_time = (time_ns() - time_start) / 1e9
@@ -267,7 +267,7 @@ function blended_conditional_gradient(
     active_set_renormalize!(active_set)
     x = compute_active_set_iterate(active_set)
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
     primal = f(x)
     #dual_gap = 2phi
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
@@ -1025,7 +1025,7 @@ function lp_separation_oracle(
         end
     end
     # otherwise, call the LMO
-    y = compute_extreme_point(lmo, direction; kwargs...)
+    y = compute_extreme_point(lmo, direction; x=x, kwargs...)
     # don't return nothing but y, fast_dot(direction, y) / use y for step outside / and update phi as in LCG (lines 402 - 406)
     return (y, fast_dot(direction, y))
 end
diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl
index 5799356e7..d4740c8ec 100644
--- a/src/fw_algorithms.jl
+++ b/src/fw_algorithms.jl
@@ -31,6 +31,7 @@ function frank_wolfe(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -131,7 +132,8 @@ function frank_wolfe(
         end
         first_iter = false
 
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
+        #@show typeof(v)
         # go easy on the memory - only compute if really needed
         if (
             (mod(t, print_iter) == 0 && verbose) ||
@@ -159,6 +161,9 @@ function frank_wolfe(
                 step_lim,
                 one(eltype(x)),
             )
+        #@show typeof(x)
+        #@show typeof(d)
+        #@show typeof(x-gamma*d)
         if callback !== nothing
             state = (
                 t=t,
@@ -201,7 +206,9 @@ function frank_wolfe(
     # hence the final computation.
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
+    #@show v
+    #@show typeof(v)
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
     if verbose
@@ -256,6 +263,7 @@ function lazified_conditional_gradient(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -351,7 +359,14 @@ function lazified_conditional_gradient(
             primal = f(x)
         end
 
-        v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy)
+        v = compute_extreme_point(
+            lmo,
+            gradient,
+            threshold=threshold,
+            greedy=greedy_lazy,
+            x=x;
+            kwargs...,
+        )
         tt = lazy
         if fast_dot(v, gradient) > threshold
             tt = dualstep
@@ -418,7 +433,7 @@ function lazified_conditional_gradient(
     # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting
     # hence the final computation.
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
 
@@ -470,6 +485,7 @@ function stochastic_frank_wolfe(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -568,7 +584,7 @@ function stochastic_frank_wolfe(
         end
         first_iter = false
 
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
         # go easy on the memory - only compute if really needed
         if (mod(t, print_iter) == 0 && verbose) ||
@@ -632,7 +648,7 @@ function stochastic_frank_wolfe(
     # last computation done with full evaluation for exact gradient
     (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     # @show (gradient, primal)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
     if verbose
diff --git a/src/oracles.jl b/src/oracles.jl
index bdcb387ca..58e59bc59 100644
--- a/src/oracles.jl
+++ b/src/oracles.jl
@@ -61,7 +61,7 @@ function compute_extreme_point(
             return v
         end
     end
-    v = compute_extreme_point(lmo.inner, direction, kwargs...)
+    v = compute_extreme_point(lmo.inner, direction; kwargs...)
     if store_cache
         lmo.last_vertex = v
     end
@@ -158,14 +158,14 @@ function compute_extreme_point(
                 end
             end
         end
-        if best_idx > 0  # && fast_dot(best_v, direction) ≤ threshold
+        if best_idx > 0 # && fast_dot(best_v, direction) ≤ threshold
            # println("cache sol")
            return best_v
        end
    end
    # no interesting point found, computing new
    # println("LP sol")
-    v = compute_extreme_point(lmo.inner, direction, kwargs...)
+    v = compute_extreme_point(lmo.inner, direction; kwargs...)
    if store_cache
        tup = Base.setindex(lmo.vertices, v, lmo.oldest_idx)
        lmo.vertices = tup
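Aside, not part of the patch: the two hunks above replace the comma before the splat with a semicolon because the two mean different things in a Julia call. A self-contained toy illustration (`f` and `opts` are placeholders):

f(a; kwargs...) = (a, values(kwargs))

opts = (threshold=0.5,)
f(1; opts...)  # keyword splat: returns (1, (threshold = 0.5,))
f(1, opts...)  # positional splat: equivalent to f(1, 0.5), throws a MethodError

With the comma, a cached-LMO keyword such as `threshold` would have been passed to the inner oracle as an extra positional argument instead of as a keyword.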
@@ -241,10 +241,10 @@ function compute_extreme_point(
     if best_idx < 0
         v = compute_extreme_point(lmo.inner, direction)
         if store_cache
-            # note: we do not check for duplicates. hence you might end up with more vertices,
+            # note: we do not check for duplicates. hence you might end up with more vertices,
             # in fact up to number of dual steps many, that might be already in the cache
-            # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it)
-            # and it is the best possible, hence we will perform a dual step on the outside.
+            # in order to reach this point, if v was already in the cache it must not meet the threshold (otherwise we would have returned it)
+            # and it is the best possible, hence we will perform a dual step on the outside.
             #
             # note: another possibility could be to test against that in the if statement but then you might end you recalculating the same vertex a few times.
             # as such this might be a better tradeoff, i.e., to not check the set for duplicates and potentially accept #dualSteps many duplicates.
@@ -303,3 +303,116 @@ function compute_extreme_point(
     end
     return storage
 end
+
+"""
+    ChasingGradientLMO{LMO,T}
+
+Oracle boosting FW with gradient pursuit: the LMO constructs its direction as a
+combination of multiple extreme vertices, `d_t = ∑_k λ_k (v_k - x_t)`, found in
+successive pursuit rounds.
+Rounds terminate once the improvement in alignment with the negative gradient
+falls below `improv_tol`, or after `max_rounds_number` rounds.
+
+The LMO keeps internal containers `d`, `u`, `residual` and `m_residual` to reduce allocations.
+See the [paper](https://arxiv.org/abs/2003.06369).
+All keyword arguments to `compute_extreme_point` are passed to the inner LMO.
+"""
+mutable struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G,G2} <: LinearMinimizationOracle
+    inner::LMO
+    max_rounds_number::Int
+    improv_tol::T
+    d::G
+    u::G
+    residual::G2
+    m_residual::G2
+end
+
+function ChasingGradientLMO(lmo, max_rounds_number, improv_tol, d, residual)
+    return ChasingGradientLMO(
+        lmo,
+        max_rounds_number,
+        improv_tol,
+        d,
+        copy(d),
+        residual,
+        copy(residual),
+    )
+end
+
+function _zero!(d)
+    @. d = 0
+    return nothing
+end
+
+function _zero!(d::SparseArrays.AbstractSparseArray)
+    @. d = 0
+    SparseArrays.dropzeros!(d)
+    return nothing
+end
+
+function _inplace_plus(d, v)
+    @. d += v
+end
+
+function _inplace_plus(d, v::ScaledHotVector)
+    return d[v.val_idx] += v.active_val
+end
+
+function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...)
+    _zero!(lmo.d)
+    v_ret = 0 * lmo.d
+    Λ = zero(eltype(direction))
+    norm_direction = norm(direction)
+    if norm_direction <= eps(eltype(direction))
+        # degenerate (numerically zero) direction: fall back to a single inner LMO call
+        v_ret .+= compute_extreme_point(lmo.inner, direction, x=x; kwargs...)
+        return v_ret
+    end
+    use_v = true
+    # alignment of d with the descent direction -direction, in [-1, 1]
+    function align(d)
+        if norm(d) <= eps(eltype(d))
+            return -one(eltype(d))
+        else
+            return -fast_dot(direction, d) / (norm_direction * norm(d))
+        end
+    end
+    @. lmo.residual = -direction
+    @. lmo.m_residual = direction
+    for round in 1:lmo.max_rounds_number
+        if norm(lmo.residual) <= eps(eltype(lmo.residual))
+            break
+        end
+        v = compute_extreme_point(lmo.inner, lmo.m_residual; kwargs...)
+        @. lmo.u = -x
+        _inplace_plus(lmo.u, v)
+        d_norm = norm(lmo.d)
+        # either add the new atom v - x or take a retraction step along -d
+        if d_norm > 0 && fast_dot(lmo.residual, lmo.d / d_norm) < -fast_dot(lmo.residual, lmo.u)
+            @. lmo.u = -lmo.d / d_norm
+            use_v = false
+        end
+        λ = fast_dot(lmo.residual, lmo.u) / (norm(lmo.u)^2)
+        d_updated = lmo.u
+        @. d_updated = lmo.d + λ * lmo.u # 2 allocs
+        if align(d_updated) - align(lmo.d) >= lmo.improv_tol
+            @. lmo.d = d_updated
+            # @. lmo.residual = -direction - lmo.d
+            @. lmo.residual -= λ * lmo.u
+            @. lmo.m_residual = -lmo.residual
+            if use_v
+                Λ += λ
+            else
+                Λ = Λ * (1 - λ / d_norm)
+            end
+            use_v = true
+        else
+            #@show round
+            break
+        end
+    end
+    if Λ <= eps(eltype(Λ))
+        # no pursuit round was accepted: return the plain FW vertex of the inner LMO
+        v_ret .+= compute_extreme_point(lmo.inner, direction, x=x; kwargs...)
+        return v_ret
+    end
+    # rescale the accumulated direction d ≈ ∑_k λ_k (v_k - x) back to a point x + d / Λ
+    @. v_ret += x
+    @. v_ret += lmo.d / Λ # 2 allocations
+    return v_ret
+end
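Aside, not part of the patch: a sketch of how the new oracle is wired into `frank_wolfe`, mirroring the test added in `test/lmo.jl` below; the problem data (`n`, `xp`, `f`, `grad!`) are illustrative only. The fourth and fifth constructor arguments are the reusable workspaces `d` (here sparse) and `residual` (dense), and must match the dimension of the iterates.

using LinearAlgebra, SparseArrays
import FrankWolfe

n = 1_000
xp = rand(n); xp ./= sum(xp)
f(x) = norm(x - xp)^2
grad!(storage, x) = (storage .= 2 .* (x .- xp); nothing)

lmo_inner = FrankWolfe.ProbabilitySimplexOracle(1.0)
# at most 10 pursuit rounds per call, stop once the alignment gain drops below 1e-3
lmo = FrankWolfe.ChasingGradientLMO(lmo_inner, 10, 1e-3, spzeros(n), zeros(n))

x0 = sparse(FrankWolfe.compute_extreme_point(lmo_inner, zeros(n)))
x, v, primal, dual_gap, traj = FrankWolfe.frank_wolfe(
    f,
    grad!,
    lmo,
    x0,
    max_iteration=500,
    gradient=zeros(n),
    line_search=FrankWolfe.Adaptive(),
)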
diff --git a/src/simplex_oracles.jl b/src/simplex_oracles.jl
index 34d5407cd..e1e96e411 100644
--- a/src/simplex_oracles.jl
+++ b/src/simplex_oracles.jl
@@ -21,7 +21,7 @@ LMO for scaled unit simplex:
 Returns either vector of zeros or vector with one active value equal to RHS
 if there exists an improving direction.
 """
-function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction) where {T}
+function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction; kwargs...) where {T}
     idx = argmin(direction)
     if direction[idx] < 0
         return ScaledHotVector(lmo.right_side, idx, length(direction))
diff --git a/test/lmo.jl b/test/lmo.jl
index 4cbd92a14..5492bb050 100644
--- a/test/lmo.jl
+++ b/test/lmo.jl
@@ -1,7 +1,7 @@
 using Test
 using FrankWolfe
 using LinearAlgebra
-import SparseArrays
+using SparseArrays
 
 import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO
 
@@ -519,7 +519,7 @@ end
 end
 
 @testset "Product LMO" begin
-    #
+    #
     lmo = FrankWolfe.ProductLMO(FrankWolfe.LpNormLMO{Inf}(3.0), FrankWolfe.LpNormLMO{1}(2.0))
     dinf = randn(10)
     d1 = randn(5)
@@ -532,3 +532,53 @@ end
     vvec = FrankWolfe.compute_extreme_point(lmo, [dinf; d1]; direction_indices=(1:10, 11:15))
     @test vvec ≈ [vinf; v1]
 end
+
+
+@testset "Chasing Gradient LMO" begin
+    @testset "Consistent results with 1 round" begin
+        max_rounds = 1
+        improv_tol = 1e-3
+        n = Int(1e5)
+        xpi = rand(n)
+        total = sum(xpi)
+        xp = xpi ./ total
+        f(x) = norm(x - xp)^2
+        function grad!(storage, x)
+            @. storage = 2 * (x - xp)
+            return nothing
+        end
+        d = spzeros(n)
+        residual = zeros(n)
+        for lmo_inner in [
+            FrankWolfe.LpNormLMO{1}(1.0),
+            FrankWolfe.LpNormLMO{1}(1.0),
+            FrankWolfe.ProbabilitySimplexOracle(1.0),
+        ]
+            x00 = FrankWolfe.compute_extreme_point(lmo_inner, zeros(n))
+            lmo = FrankWolfe.ChasingGradientLMO(lmo_inner, max_rounds, improv_tol, d, residual)
+            x0 = sparse(deepcopy(x00))
+            res_boosting = FrankWolfe.frank_wolfe(
+                f,
+                grad!,
+                lmo,
+                x0,
+                max_iteration=500,
+                gradient=zeros(n),
+                line_search=FrankWolfe.Adaptive(),
+                verbose=true,
+            )
+            x0 = deepcopy(x00)
+            res = FrankWolfe.frank_wolfe(
+                f,
+                grad!,
+                lmo_inner,
+                x0,
+                max_iteration=500,
+                line_search=FrankWolfe.Adaptive(),
+                verbose=true,
+            )
+            @test isapprox(res_boosting[3], res[3])
+        end
+    end
+
+end
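Aside, not part of the patch: the `kwargs...` added to the algorithm entry points lets a caller thread LMO-specific options through `frank_wolfe` without touching the algorithm. A hypothetical sketch (`RadiusBallLMO` and its `radius` keyword are made up for illustration):

import FrankWolfe
using LinearAlgebra

struct RadiusBallLMO <: FrankWolfe.LinearMinimizationOracle end

function FrankWolfe.compute_extreme_point(::RadiusBallLMO, direction; radius=1.0, kwargs...)
    nd = norm(direction)
    # guard against a (numerically) zero direction; any feasible point works then
    nd <= eps(eltype(direction)) && return zeros(length(direction))
    return -radius / nd * direction
end

f(x) = sum(abs2, x .- 0.1)
grad!(storage, x) = (storage .= 2 .* (x .- 0.1); nothing)

# `radius=2.0` is collected by the new `kwargs...` in `frank_wolfe` and handed to the LMO
FrankWolfe.frank_wolfe(f, grad!, RadiusBallLMO(), ones(5), max_iteration=20, radius=2.0)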
diff --git a/test/temp.jl b/test/temp.jl
new file mode 100644
index 000000000..439d1a3a4
--- /dev/null
+++ b/test/temp.jl
@@ -0,0 +1,61 @@
+
+using Test
+using FrankWolfe
+using LinearAlgebra
+
+using SparseArrays
+
+using ProfileView
+using Random
+using BenchmarkTools
+import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO
+
+using Plots
+
+
+
+function test(lmo, direction, x0, xp)
+    for _ in 1:100
+        FrankWolfe.compute_extreme_point(lmo, direction, x=x0)
+    end
+    # res_boosting = FrankWolfe.frank_wolfe(
+    #     f,
+    #     grad!,
+    #     lmo,
+    #     x0,
+    #     gradient=zeros(n),
+    #     max_iteration=500,
+    #     print_iter=50,
+    #     line_search=FrankWolfe.Adaptive(),
+    #     verbose=true,
+    # )
+end
+
+
+function bench()
+    max_rounds = 10
+    improv_tol = 10e-3
+    for nb_dim in [Int(1e2), Int(1e3), Int(1e4), Int(1e5), Int(1e6)]
+        @show nb_dim
+        Random.seed!(1234)
+        xpi = rand(nb_dim)
+        total = sum(xpi)
+        xp = xpi ./ total
+        d = spzeros(nb_dim)
+        lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0)
+        x0 = sparse(FrankWolfe.compute_extreme_point(lmo_prob, zeros(nb_dim)))
+        residual = zeros(nb_dim)
+        direction = rand(nb_dim)
+        f(x) = norm(x - xp)^2
+        function grad!(storage, x)
+            @. storage = 2 * (x - xp)
+            return nothing
+        end
+        lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol, d, residual)
+        # @profview test(lmo, direction, x0, xp)
+        # @profview test(lmo, direction, x0, xp)
+        @btime test($lmo, $direction, $x0, $xp)
+    end
+end
+
+bench()