From e277f4100c5488d428b53803f1492a5a8733739f Mon Sep 17 00:00:00 2001 From: mattplo Date: Wed, 7 Jul 2021 12:41:01 +0200 Subject: [PATCH 1/9] LMO implementation & basic tests --- Project.toml | 1 + src/afw.jl | 12 ++++---- src/blended_cg.jl | 8 +++--- src/fw_algorithms.jl | 12 ++++---- src/oracles.jl | 66 +++++++++++++++++++++++++++++++++++++++++--- test/lmo.jl | 46 ++++++++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index 07dd95d94..919235c1a 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.1.4" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6" Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" diff --git a/src/afw.jl b/src/afw.jl index 904a65584..6d98208d0 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -89,7 +89,7 @@ function away_frank_wolfe( x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) phi_value = fast_dot(x, gradient) - fast_dot(v, gradient) while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -216,7 +216,7 @@ function away_frank_wolfe( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tt = last @@ -238,7 +238,7 @@ function away_frank_wolfe( active_set_cleanup!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -289,7 +289,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) index = a_loc #Resort to calling the LMO else - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) # Real dual gap promises enough progress. 
grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex @@ -321,7 +321,7 @@ function afw_step(x, gradient, lmo, active_set) local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) away_gap = fast_dot(a, gradient) - fast_dot(x, gradient) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x dual_gap = grad_dot_x - fast_dot(v, gradient) @@ -346,7 +346,7 @@ function afw_step(x, gradient, lmo, active_set) end function fw_step(x, gradient, lmo) - vertex = compute_extreme_point(lmo, gradient) + vertex = compute_extreme_point(lmo, gradient, x=x) return ( x - vertex, vertex, diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 3b0ca1769..60d34a628 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -52,7 +52,7 @@ function blended_conditional_gradient( primal = f(x) grad!(gradient, x) # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient) + vmax = compute_extreme_point(lmo, gradient, x=x) phi = fast_dot(gradient, x0 - vmax) / 2 dual_gap = phi traj_data = [] @@ -247,7 +247,7 @@ function blended_conditional_gradient( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tot_time = (time_ns() - time_start) / 1e9 @@ -271,7 +271,7 @@ function blended_conditional_gradient( active_set_renormalize!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient, x=x) primal = f(x) #dual_gap = 2phi dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -1029,7 +1029,7 @@ function lp_separation_oracle( end end # otherwise, call the LMO - y = compute_extreme_point(lmo, direction; kwargs...) + y = compute_extreme_point(lmo, direction; x=x, kwargs...) # don't return nothing but y, fast_dot(direction, y) / use y for step outside / and update phi as in LCG (lines 402 - 406) return (y, fast_dot(direction, y)) end diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index d8feb368a..ffc02c098 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -132,7 +132,7 @@ function frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) # go easy on the memory - only compute if really needed if ( (mod(t, print_iter) == 0 && verbose) || @@ -201,7 +201,7 @@ function frank_wolfe( # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -347,7 +347,7 @@ function lazified_conditional_gradient( primal = f(x) end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) + v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy,x=x) tt = lazy if fast_dot(v, gradient) > threshold tt = dualstep @@ -413,7 +413,7 @@ function lazified_conditional_gradient( # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting # hence the final computation. 
grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -559,7 +559,7 @@ function stochastic_frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) # go easy on the memory - only compute if really needed if (mod(t, print_iter) == 0 && verbose) || @@ -622,7 +622,7 @@ function stochastic_frank_wolfe( # last computation done with full evaluation for exact gradient (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient) + v = compute_extreme_point(lmo, gradient,x=x) # @show (gradient, primal) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose diff --git a/src/oracles.jl b/src/oracles.jl index bdcb387ca..c3936d8f6 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -158,7 +158,7 @@ function compute_extreme_point( end end end - if best_idx > 0 # && fast_dot(best_v, direction) ≤ threshold + if best_idx > 0 # && fast_dot(best_v, direction) ≤ threshold # println("cache sol") return best_v end @@ -241,10 +241,10 @@ function compute_extreme_point( if best_idx < 0 v = compute_extreme_point(lmo.inner, direction) if store_cache - # note: we do not check for duplicates. hence you might end up with more vertices, + # note: we do not check for duplicates. hence you might end up with more vertices, # in fact up to number of dual steps many, that might be already in the cache - # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it) - # and it is the best possible, hence we will perform a dual step on the outside. + # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it) + # and it is the best possible, hence we will perform a dual step on the outside. # # note: another possibility could be to test against that in the if statement but then you might end you recalculating the same vertex a few times. # as such this might be a better tradeoff, i.e., to not check the set for duplicates and potentially accept #dualSteps many duplicates. @@ -303,3 +303,61 @@ function compute_extreme_point( end return storage end + +""" +""" +struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle + inner::LMO + max_rounds_number::Int + improv_tol::T +end + +""" +""" +function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) + d = 0 * direction + Λ = 0 + #@debug x + #@debug direction + if norm(direction) == 0 + return compute_extreme_point(lmo.inner, direction, x=x, kwargs...) + end + flag = true + function align(d1, d2) + if norm(d1) == 0 || norm(d2) == 0 + return -1 + else + return fast_dot(d1, d2) / (norm(d1) * norm(d2)) + end + end + residual = -direction - d + for round in 1:lmo.max_rounds_number + @debug round + #@debug residual + v = compute_extreme_point(lmo.inner, -residual; x=x, kwargs...) 
+ u = v - x + #@debug v + #@debug v - x + d_norm = norm(d) + if d_norm > 0 && fast_dot(-residual, -d / d_norm) < fast_dot(-residual, u) + u = -d / d_norm + flag = false + end + λ = fast_dot(residual, u) / (norm(u)^2) + d_updated = d + λ * u + if align(-direction, d_updated) - align(-direction, d) >= lmo.improv_tol + d = d_updated + residual = -direction - d + if flag + Λ += λ + else + Λ = Λ * (1 - λ / d_norm) + end + flag = true + else + break + end + end + #@debug x + d / Λ + return x + d / Λ +end diff --git a/test/lmo.jl b/test/lmo.jl index 4cbd92a14..98af39504 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -532,3 +532,49 @@ end vvec = FrankWolfe.compute_extreme_point(lmo, [dinf; d1]; direction_indices=(1:10, 11:15)) @test vvec ≈ [vinf; v1] end + +@testset "ChasingGradient LMO" begin + max_rounds = 100 + improv_tol = 10e-3 + f(x) = norm(x)^2 + function grad!(storage, x) + return storage .= 2x + end + lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1) + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) + x0 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(5)) + @show x0 + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo, + x0, + max_iteration=10, + line_search=FrankWolfe.Agnostic(), + verbose=true, + ) + @show res +end + +@testset "ChasingGradient LMO 2" begin + max_rounds = 100 + improv_tol = 10e-3 + f(x) = norm(x)^2 + function grad!(storage, x) + return storage .= 2x + end + lmo_norm = FrankWolfe.LpNormLMO{1}(1) + lmo = FrankWolfe.ChasingGradientLMO(lmo_norm, max_rounds, improv_tol) + x0 = ones(5) + @show x0 + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo, + x0, + max_iteration=10, + line_search=FrankWolfe.Agnostic(), + verbose=true, + ) + @show res +end From c56a2b417c46bf7b61911f4bbbc031fa1ee02fe4 Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 8 Jul 2021 11:04:26 +0200 Subject: [PATCH 2/9] fixes, add kwargs --- Project.toml | 1 - src/afw.jl | 21 +++++++++--------- src/blended_cg.jl | 6 ++--- src/fw_algorithms.jl | 22 ++++++++++++++----- src/oracles.jl | 37 ++++++++++++++++++------------- test/lmo.jl | 52 ++++++++++++++++++-------------------------- 6 files changed, 73 insertions(+), 66 deletions(-) diff --git a/Project.toml b/Project.toml index 919235c1a..07dd95d94 100644 --- a/Project.toml +++ b/Project.toml @@ -7,7 +7,6 @@ version = "0.1.4" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" -GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6" Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" diff --git a/src/afw.jl b/src/afw.jl index 6d98208d0..47435686b 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -32,6 +32,7 @@ function away_frank_wolfe( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -89,7 +90,7 @@ function away_frank_wolfe( x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) phi_value = fast_dot(x, gradient) - fast_dot(v, gradient) while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -216,7 +217,7 @@ function away_frank_wolfe( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) 
primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tt = last @@ -238,7 +239,7 @@ function away_frank_wolfe( active_set_cleanup!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -263,7 +264,7 @@ end function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) - #Do lazy FW step + # Do lazy FW step grad_dot_lazy_fw_vertex = fast_dot(v, gradient) grad_dot_x = fast_dot(x, gradient) grad_dot_a = fast_dot(a, gradient) @@ -277,7 +278,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) fw_step_taken = true index = v_loc else - #Do away step, as it promises enough progress. + # Do away step, as it promises enough progress. if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex && grad_dot_a - grad_dot_x >= phi / K tt = away @@ -287,9 +288,9 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) away_step_taken = true fw_step_taken = false index = a_loc - #Resort to calling the LMO + # Resort to calling the LMO else - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex @@ -301,7 +302,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) away_step_taken = false fw_step_taken = true index = nothing - #Lower our expectation for progress. + # Lower our expectation for progress. else tt = dualstep phi = min(dual_gap, phi / 2.0) @@ -321,7 +322,7 @@ function afw_step(x, gradient, lmo, active_set) local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) away_gap = fast_dot(a, gradient) - fast_dot(x, gradient) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x dual_gap = grad_dot_x - fast_dot(v, gradient) @@ -346,7 +347,7 @@ function afw_step(x, gradient, lmo, active_set) end function fw_step(x, gradient, lmo) - vertex = compute_extreme_point(lmo, gradient, x=x) + vertex = compute_extreme_point(lmo, gradient, x=x, kwargs...) return ( x - vertex, vertex, diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 60d34a628..8fe2b47cc 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -52,7 +52,7 @@ function blended_conditional_gradient( primal = f(x) grad!(gradient, x) # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient, x=x) + vmax = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) phi = fast_dot(gradient, x0 - vmax) / 2 dual_gap = phi traj_data = [] @@ -247,7 +247,7 @@ function blended_conditional_gradient( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) 
primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tot_time = (time_ns() - time_start) / 1e9 @@ -271,7 +271,7 @@ function blended_conditional_gradient( active_set_renormalize!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x) + v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) primal = f(x) #dual_gap = 2phi dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index ffc02c098..b4e976570 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -31,6 +31,7 @@ function frank_wolfe( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -132,7 +133,7 @@ function frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # go easy on the memory - only compute if really needed if ( (mod(t, print_iter) == 0 && verbose) || @@ -201,7 +202,7 @@ function frank_wolfe( # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -256,6 +257,7 @@ function lazified_conditional_gradient( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -347,7 +349,14 @@ function lazified_conditional_gradient( primal = f(x) end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy,x=x) + v = compute_extreme_point( + lmo, + gradient, + threshold=threshold, + greedy=greedy_lazy, + x=x, + kwargs..., + ) tt = lazy if fast_dot(v, gradient) > threshold tt = dualstep @@ -413,7 +422,7 @@ function lazified_conditional_gradient( # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -465,6 +474,7 @@ function stochastic_frank_wolfe( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + kwargs..., ) # format string for output of the algorithm @@ -559,7 +569,7 @@ function stochastic_frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # go easy on the memory - only compute if really needed if (mod(t, print_iter) == 0 && verbose) || @@ -622,7 +632,7 @@ function stochastic_frank_wolfe( # last computation done with full evaluation for exact gradient (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient,x=x) + v = compute_extreme_point(lmo, gradient, x=x, kwargs...) # @show (gradient, primal) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose diff --git a/src/oracles.jl b/src/oracles.jl index c3936d8f6..2223c9cb9 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -305,6 +305,14 @@ function compute_extreme_point( end """ + ChasingGradientLMO{LMO,T} + +Oracle wrapping another one of type LMO to boost FW with gradient pursuit. 
+Pursuit rounds end once the alignment improvement with the direction of the gradient +is lower than `improv_tol` or `max_rounds_number` is reached. +See the [paper](https://arxiv.org/abs/2003.06369) + +All keyword arguments are passed to the inner LMO. """ struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle inner::LMO @@ -312,32 +320,27 @@ struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimization improv_tol::T end -""" -""" function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) d = 0 * direction - Λ = 0 - #@debug x - #@debug direction - if norm(direction) == 0 + Λ = 0.0 + if norm(direction) <= eps(eltype(direction)) return compute_extreme_point(lmo.inner, direction, x=x, kwargs...) end flag = true function align(d1, d2) - if norm(d1) == 0 || norm(d2) == 0 - return -1 + if norm(d1) <= eps(eltype(d1)) || norm(d2) <= eps(eltype(d2)) + return -1.0 else return fast_dot(d1, d2) / (norm(d1) * norm(d2)) end end residual = -direction - d for round in 1:lmo.max_rounds_number - @debug round - #@debug residual - v = compute_extreme_point(lmo.inner, -residual; x=x, kwargs...) + if norm(residual) <= eps(eltype(residual)) + break + end + v = compute_extreme_point(lmo.inner, -residual; kwargs...) u = v - x - #@debug v - #@debug v - x d_norm = norm(d) if d_norm > 0 && fast_dot(-residual, -d / d_norm) < fast_dot(-residual, u) u = -d / d_norm @@ -358,6 +361,10 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) break end end - #@debug x + d / Λ - return x + d / Λ + + if Λ <= eps(eltype(Λ)) + return x + else + return x + d / Λ + end end diff --git a/test/lmo.jl b/test/lmo.jl index 98af39504..a22151a51 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -533,48 +533,38 @@ end @test vvec ≈ [vinf; v1] end -@testset "ChasingGradient LMO" begin - max_rounds = 100 - improv_tol = 10e-3 - f(x) = norm(x)^2 - function grad!(storage, x) - return storage .= 2x - end - lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1) - lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) - x0 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(5)) - @show x0 - res = FrankWolfe.frank_wolfe( - f, - grad!, - lmo, - x0, - max_iteration=10, - line_search=FrankWolfe.Agnostic(), - verbose=true, - ) - @show res -end -@testset "ChasingGradient LMO 2" begin +@testset "Chasing Gradient LMO" begin max_rounds = 100 improv_tol = 10e-3 - f(x) = norm(x)^2 + xp = ones(5) + f(x) = norm(x - xp)^2 function grad!(storage, x) - return storage .= 2x + @. 
storage = 2 * (x - xp) + return nothing end - lmo_norm = FrankWolfe.LpNormLMO{1}(1) + lmo_norm = FrankWolfe.LpNormLMO{Float64,1}(1) lmo = FrankWolfe.ChasingGradientLMO(lmo_norm, max_rounds, improv_tol) - x0 = ones(5) - @show x0 - res = FrankWolfe.frank_wolfe( + x0 = zeros(5) + res_boosting = FrankWolfe.frank_wolfe( f, grad!, lmo, x0, - max_iteration=10, + max_iteration=1, line_search=FrankWolfe.Agnostic(), verbose=true, ) - @show res + # x0 = zeros(5) + # res = FrankWolfe.frank_wolfe( + # f, + # grad!, + # lmo_norm, + # x0, + # max_iteration = 1000, + # line_search = FrankWolfe.Agnostic(), + # verbose = true, + # ) + @test abs(res_boosting[3] - 3.2) < 1.0e-5 + end From d21b90ad651542ad4472338b6e34110b402e35f3 Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 8 Jul 2021 17:21:01 +0200 Subject: [PATCH 3/9] test with ProbabilitySimplexOracle --- src/oracles.jl | 5 +++-- test/lmo.jl | 43 ++++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/oracles.jl b/src/oracles.jl index 2223c9cb9..a49f4354f 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -314,7 +314,7 @@ See the [paper](https://arxiv.org/abs/2003.06369) All keyword arguments are passed to the inner LMO. """ -struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle +struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G} <: LinearMinimizationOracle inner::LMO max_rounds_number::Int improv_tol::T @@ -337,6 +337,7 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) residual = -direction - d for round in 1:lmo.max_rounds_number if norm(residual) <= eps(eltype(residual)) + #@show round break end v = compute_extreme_point(lmo.inner, -residual; kwargs...) @@ -358,10 +359,10 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) end flag = true else + #@show round break end end - if Λ <= eps(eltype(Λ)) return x else diff --git a/test/lmo.jl b/test/lmo.jl index a22151a51..1db23a1e1 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -535,36 +535,41 @@ end @testset "Chasing Gradient LMO" begin - max_rounds = 100 + max_rounds = 1 improv_tol = 10e-3 - xp = ones(5) + n = Int(1e5) + xpi = rand(n) + total = sum(xpi) + xp = xpi ./ total f(x) = norm(x - xp)^2 function grad!(storage, x) @. 
storage = 2 * (x - xp) return nothing end - lmo_norm = FrankWolfe.LpNormLMO{Float64,1}(1) - lmo = FrankWolfe.ChasingGradientLMO(lmo_norm, max_rounds, improv_tol) - x0 = zeros(5) + lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) + x00 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(n)) + d=zeros(n) + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol,d) + x0 = deepcopy(x00) res_boosting = FrankWolfe.frank_wolfe( f, grad!, lmo, x0, - max_iteration=1, - line_search=FrankWolfe.Agnostic(), + max_iteration=500, + print_iter=50, + line_search=FrankWolfe.Adaptive(), + verbose=true, + ) + x0 = deepcopy(x00) + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo_prob, + x0, + max_iteration=500, + print_iter=50, + line_search=FrankWolfe.Adaptive(), verbose=true, ) - # x0 = zeros(5) - # res = FrankWolfe.frank_wolfe( - # f, - # grad!, - # lmo_norm, - # x0, - # max_iteration = 1000, - # line_search = FrankWolfe.Agnostic(), - # verbose = true, - # ) - @test abs(res_boosting[3] - 3.2) < 1.0e-5 - end From 86b186fce71d45e716789b567dacd3c4cb2162ae Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 8 Jul 2021 17:28:23 +0200 Subject: [PATCH 4/9] cleanup --- src/oracles.jl | 2 +- test/lmo.jl | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/oracles.jl b/src/oracles.jl index a49f4354f..75d94e73a 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -314,7 +314,7 @@ See the [paper](https://arxiv.org/abs/2003.06369) All keyword arguments are passed to the inner LMO. """ -struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G} <: LinearMinimizationOracle +struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle inner::LMO max_rounds_number::Int improv_tol::T diff --git a/test/lmo.jl b/test/lmo.jl index 1db23a1e1..d77ce6055 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -548,8 +548,7 @@ end end lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) x00 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(n)) - d=zeros(n) - lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol,d) + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) x0 = deepcopy(x00) res_boosting = FrankWolfe.frank_wolfe( f, From 4af4f5deefe28972df8cae9cc5f199e11417165e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 8 Jul 2021 22:25:33 +0200 Subject: [PATCH 5/9] Update src/oracles.jl --- src/oracles.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/oracles.jl b/src/oracles.jl index 75d94e73a..636442c66 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -365,7 +365,6 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) 
end if Λ <= eps(eltype(Λ)) return x - else - return x + d / Λ end + return x + d / Λ end From 87d29d64c80c3de0104acab02dfeb4065095e77f Mon Sep 17 00:00:00 2001 From: mattplo Date: Thu, 15 Jul 2021 17:27:10 +0200 Subject: [PATCH 6/9] improvements & tests --- src/FrankWolfe.jl | 3 +- src/afw.jl | 24 +++++----- src/blended_cg.jl | 6 +-- src/fw_algorithms.jl | 18 ++++--- src/oracles.jl | 106 +++++++++++++++++++++++++++++------------ src/simplex_oracles.jl | 2 +- test/lmo.jl | 84 ++++++++++++++++++-------------- test/temp.jl | 61 ++++++++++++++++++++++++ 8 files changed, 213 insertions(+), 91 deletions(-) create mode 100644 test/temp.jl diff --git a/src/FrankWolfe.jl b/src/FrankWolfe.jl index 0f5eee244..ccd0054b1 100644 --- a/src/FrankWolfe.jl +++ b/src/FrankWolfe.jl @@ -1,5 +1,6 @@ module FrankWolfe +using Plots: SparseArrays using LinearAlgebra using Printf using ProgressMeter @@ -41,7 +42,7 @@ include("fw_algorithms.jl") # collecting most common data types etc and precompile # min version req set to 1.5 to prevent stalling of julia 1 -@static if VERSION >= v"1.5" +@static if VERSION >= v"1.5" println("Precompiling common signatures. This might take a moment...") include("precompile.jl") end diff --git a/src/afw.jl b/src/afw.jl index 47435686b..92d55f507 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -90,7 +90,7 @@ function away_frank_wolfe( x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) phi_value = fast_dot(x, gradient) - fast_dot(v, gradient) while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -128,14 +128,14 @@ function away_frank_wolfe( if away_steps if lazy d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K) + lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K, kwargs...) else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - afw_step(x, gradient, lmo, active_set) + afw_step(x, gradient, lmo, active_set; kwargs...) end else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - fw_step(x, gradient, lmo) + fw_step(x, gradient, lmo; kwargs...) end @@ -217,7 +217,7 @@ function away_frank_wolfe( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tt = last @@ -239,7 +239,7 @@ function away_frank_wolfe( active_set_cleanup!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -262,7 +262,7 @@ function away_frank_wolfe( return x, v, primal, dual_gap, traj_data, active_set end -function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) +function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0, kwargs...) v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) # Do lazy FW step grad_dot_lazy_fw_vertex = fast_dot(v, gradient) @@ -290,7 +290,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) index = a_loc # Resort to calling the LMO else - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) 
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...) # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex @@ -318,11 +318,11 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0) return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt end -function afw_step(x, gradient, lmo, active_set) +function afw_step(x, gradient, lmo, active_set; kwargs...) local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) away_gap = fast_dot(a, gradient) - fast_dot(x, gradient) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x dual_gap = grad_dot_x - fast_dot(v, gradient) @@ -346,8 +346,8 @@ function afw_step(x, gradient, lmo, active_set) return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt end -function fw_step(x, gradient, lmo) - vertex = compute_extreme_point(lmo, gradient, x=x, kwargs...) +function fw_step(x, gradient, lmo; kwargs...) + vertex = compute_extreme_point(lmo, gradient, x=x; kwargs...) return ( x - vertex, vertex, diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 8fe2b47cc..f01af2dbb 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -52,7 +52,7 @@ function blended_conditional_gradient( primal = f(x) grad!(gradient, x) # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) + vmax = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...) phi = fast_dot(gradient, x0 - vmax) / 2 dual_gap = phi traj_data = [] @@ -247,7 +247,7 @@ function blended_conditional_gradient( if verbose x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) tot_time = (time_ns() - time_start) / 1e9 @@ -271,7 +271,7 @@ function blended_conditional_gradient( active_set_renormalize!(active_set) x = compute_active_set_iterate(active_set) grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, lmo_kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...) primal = f(x) #dual_gap = 2phi dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index b4e976570..0258fbc59 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -133,7 +133,8 @@ function frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) + #@show typeof(v) # go easy on the memory - only compute if really needed if ( (mod(t, print_iter) == 0 && verbose) || @@ -173,6 +174,9 @@ function frank_wolfe( step_lim, one(eltype(x)), ) + #@show typeof(x) + #@show typeof(d) + #@show typeof(x-gamma*d) @emphasis(emphasis, x = x - gamma * d) @@ -202,7 +206,9 @@ function frank_wolfe( # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) 
+ #@show v + #@show typeof(v) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose @@ -354,7 +360,7 @@ function lazified_conditional_gradient( gradient, threshold=threshold, greedy=greedy_lazy, - x=x, + x=x; kwargs..., ) tt = lazy @@ -422,7 +428,7 @@ function lazified_conditional_gradient( # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting # hence the final computation. grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) primal = f(x) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) @@ -569,7 +575,7 @@ function stochastic_frank_wolfe( end first_iter = false - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) # go easy on the memory - only compute if really needed if (mod(t, print_iter) == 0 && verbose) || @@ -632,7 +638,7 @@ function stochastic_frank_wolfe( # last computation done with full evaluation for exact gradient (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient, x=x, kwargs...) + v = compute_extreme_point(lmo, gradient, x=x; kwargs...) # @show (gradient, primal) dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) if verbose diff --git a/src/oracles.jl b/src/oracles.jl index 75d94e73a..31fe7f03f 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -61,7 +61,7 @@ function compute_extreme_point( return v end end - v = compute_extreme_point(lmo.inner, direction, kwargs...) + v = compute_extreme_point(lmo.inner, direction; kwargs...) if store_cache lmo.last_vertex = v end @@ -165,7 +165,7 @@ function compute_extreme_point( end # no interesting point found, computing new # println("LP sol") - v = compute_extreme_point(lmo.inner, direction, kwargs...) + v = compute_extreme_point(lmo.inner, direction; kwargs...) if store_cache tup = Base.setindex(lmo.vertices, v, lmo.oldest_idx) lmo.vertices = tup @@ -308,64 +308,108 @@ end ChasingGradientLMO{LMO,T} Oracle wrapping another one of type LMO to boost FW with gradient pursuit. -Pursuit rounds end once the alignment improvement with the direction of the gradient +Pursuit rounds end once the alignment improvement with the direction of the gradient is lower than `improv_tol` or `max_rounds_number` is reached. See the [paper](https://arxiv.org/abs/2003.06369) All keyword arguments are passed to the inner LMO. """ -struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T} <: LinearMinimizationOracle +mutable struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G,G2} <: LinearMinimizationOracle inner::LMO max_rounds_number::Int improv_tol::T + d::G + u::G + residual::G2 + m_residual::G2 +end + +function ChasingGradientLMO(lmo, max_rounds_number, improv_tol, d, residual) + return ChasingGradientLMO( + lmo, + max_rounds_number, + improv_tol, + d, + copy(d), + residual, + copy(residual), + ) +end + +function _zero!(d) + @. d = 0 +end + +function _zero!(d::SparseArrays.AbstractSparseArray) + @. d = 0 + return SparseArrays.dropzeros!(d) +end + +function _inplace_plus(d, v) + @. d += v +end + +function _inplace_plus(d, v::ScaledHotVector) + return d[v.val_idx] += v.active_val end function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) - d = 0 * direction - Λ = 0.0 - if norm(direction) <= eps(eltype(direction)) - return compute_extreme_point(lmo.inner, direction, x=x, kwargs...) 
+ _zero!(lmo.d) + v_ret = 0 * lmo.d + Λ = zero(eltype(direction)) + norm_direction = norm(direction) + if norm_direction <= eps(eltype(direction)) + v_ret .+= compute_extreme_point(lmo.inner, direction, x=x, kwargs...) + return v_ret end - flag = true - function align(d1, d2) - if norm(d1) <= eps(eltype(d1)) || norm(d2) <= eps(eltype(d2)) - return -1.0 + use_v = true + function align(d) + if norm(d) <= eps(eltype(d)) + return -one(eltype(d)) else - return fast_dot(d1, d2) / (norm(d1) * norm(d2)) + return -fast_dot(direction, d) / (norm_direction * norm(d)) end end - residual = -direction - d + @. lmo.residual = -direction + @. lmo.m_residual = direction for round in 1:lmo.max_rounds_number - if norm(residual) <= eps(eltype(residual)) + if norm(lmo.residual) <= eps(eltype(lmo.residual)) #@show round break end - v = compute_extreme_point(lmo.inner, -residual; kwargs...) - u = v - x - d_norm = norm(d) - if d_norm > 0 && fast_dot(-residual, -d / d_norm) < fast_dot(-residual, u) - u = -d / d_norm - flag = false + v = compute_extreme_point(lmo.inner, lmo.m_residual; kwargs...) + #@. lmo.u = v - x + @. lmo.u = -x + _inplace_plus(lmo.u, v) + d_norm = norm(lmo.d) + if d_norm > 0 && fast_dot(lmo.residual, lmo.d / d_norm) < -fast_dot(lmo.residual, lmo.u) + @. lmo.u = -lmo.d / d_norm + use_v = false end - λ = fast_dot(residual, u) / (norm(u)^2) - d_updated = d + λ * u - if align(-direction, d_updated) - align(-direction, d) >= lmo.improv_tol - d = d_updated - residual = -direction - d - if flag + λ = fast_dot(lmo.residual, lmo.u) / (norm(lmo.u)^2) + d_updated = lmo.u + @. d_updated = lmo.d + λ * lmo.u #2 allocs + if align(d_updated) - align(lmo.d) >= lmo.improv_tol + @. lmo.d = d_updated + # @. lmo.residual = -direction - lmo.d + @. lmo.residual -= λ * lmo.u + @. lmo.m_residual = -lmo.residual + if use_v Λ += λ else Λ = Λ * (1 - λ / d_norm) end - flag = true + use_v = true else #@show round break end end if Λ <= eps(eltype(Λ)) - return x - else - return x + d / Λ + v_ret .+= compute_extreme_point(lmo.inner, direction, x=x, kwargs...) + return v_ret end + @. v_ret += x + @. v_ret += lmo.d / Λ #2 allocations + return v_ret end diff --git a/src/simplex_oracles.jl b/src/simplex_oracles.jl index 34d5407cd..e1e96e411 100644 --- a/src/simplex_oracles.jl +++ b/src/simplex_oracles.jl @@ -21,7 +21,7 @@ LMO for scaled unit simplex: Returns either vector of zeros or vector with one active value equal to RHS if there exists an improving direction. """ -function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction) where {T} +function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction; kwargs...) where {T} idx = argmin(direction) if direction[idx] < 0 return ScaledHotVector(lmo.right_side, idx, length(direction)) diff --git a/test/lmo.jl b/test/lmo.jl index d77ce6055..5492bb050 100644 --- a/test/lmo.jl +++ b/test/lmo.jl @@ -1,7 +1,7 @@ using Test using FrankWolfe using LinearAlgebra -import SparseArrays +using SparseArrays import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO @@ -519,7 +519,7 @@ end end @testset "Product LMO" begin - # + # lmo = FrankWolfe.ProductLMO(FrankWolfe.LpNormLMO{Inf}(3.0), FrankWolfe.LpNormLMO{1}(2.0)) dinf = randn(10) d1 = randn(5) @@ -535,40 +535,50 @@ end @testset "Chasing Gradient LMO" begin - max_rounds = 1 - improv_tol = 10e-3 - n = Int(1e5) - xpi = rand(n) - total = sum(xpi) - xp = xpi ./ total - f(x) = norm(x - xp)^2 - function grad!(storage, x) - @. 
storage = 2 * (x - xp) - return nothing + @testset "Consistent results with 1 round" begin + max_rounds = 1 + improv_tol = 1e-3 + n = Int(1e5) + xpi = rand(n) + total = sum(xpi) + xp = xpi ./ total + f(x) = norm(x - xp)^2 + function grad!(storage, x) + @. storage = 2 * (x - xp) + return nothing + end + d = spzeros(n) + residual = zeros(n) + for lmo_inner in [ + FrankWolfe.LpNormLMO{1}(1.0), + FrankWolfe.LpNormLMO{1}(1.0), + FrankWolfe.ProbabilitySimplexOracle(1.0), + ] + x00 = FrankWolfe.compute_extreme_point(lmo_inner, zeros(n)) + lmo = FrankWolfe.ChasingGradientLMO(lmo_inner, max_rounds, improv_tol, d, residual) + x0 = sparse(deepcopy(x00)) + res_boosting = FrankWolfe.frank_wolfe( + f, + grad!, + lmo, + x0, + max_iteration=500, + gradient=zeros(n), + line_search=FrankWolfe.Adaptive(), + verbose=true, + ) + x0 = deepcopy(x00) + res = FrankWolfe.frank_wolfe( + f, + grad!, + lmo_inner, + x0, + max_iteration=500, + line_search=FrankWolfe.Adaptive(), + verbose=true, + ) + @test isapprox(res_boosting[3], res[3]) + end end - lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) - x00 = FrankWolfe.compute_extreme_point(lmo_prob, zeros(n)) - lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol) - x0 = deepcopy(x00) - res_boosting = FrankWolfe.frank_wolfe( - f, - grad!, - lmo, - x0, - max_iteration=500, - print_iter=50, - line_search=FrankWolfe.Adaptive(), - verbose=true, - ) - x0 = deepcopy(x00) - res = FrankWolfe.frank_wolfe( - f, - grad!, - lmo_prob, - x0, - max_iteration=500, - print_iter=50, - line_search=FrankWolfe.Adaptive(), - verbose=true, - ) + end diff --git a/test/temp.jl b/test/temp.jl new file mode 100644 index 000000000..439d1a3a4 --- /dev/null +++ b/test/temp.jl @@ -0,0 +1,61 @@ + +using Test +using FrankWolfe +using LinearAlgebra + +using SparseArrays + +using ProfileView +using Random +using BenchmarkTools +import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO + +using Plots + + + +function test(lmo, direction, x0, xp) + for _ in 1:100 + FrankWolfe.compute_extreme_point(lmo, direction, x=x0) + end + # res_boosting = FrankWolfe.frank_wolfe( + # f, + # grad!, + # lmo, + # x0, + # gradient=zeros(n), + # max_iteration=500, + # print_iter=50, + # line_search=FrankWolfe.Adaptive(), + # verbose=true, + # ) +end + + +function bench() + max_rounds = 10 + improv_tol = 10e-3 + for nb_dim in [Int(1e2), Int(1e3), Int(1e4), Int(1e5), Int(1e6)] + @show nb_dim + Random.seed!(1234) + xpi = rand(nb_dim) + total = sum(xpi) + xp = xpi ./ total + d = spzeros(nb_dim) + lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0) + x0 = sparse(FrankWolfe.compute_extreme_point(lmo_prob, zeros(nb_dim))) + residual = zeros(nb_dim) + direction = rand(nb_dim) + f(x) = norm(x - xp)^2 + function grad!(storage, x) + @. 
storage = 2 * (x - xp) + return nothing + end + lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol, d, residual) + # @profview test(lmo, direction, x0,xp) + # @profview test(lmo, direction, x0,xp) + @btime test($lmo, $direction, $x0, $xp) + end +end + +bench() From abfd67b25dddfe958aa8c37c3160073e7930fc07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 19 Jul 2021 17:40:46 +0200 Subject: [PATCH 7/9] remove wrong import --- src/FrankWolfe.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/FrankWolfe.jl b/src/FrankWolfe.jl index ccd0054b1..4c403870f 100644 --- a/src/FrankWolfe.jl +++ b/src/FrankWolfe.jl @@ -1,6 +1,5 @@ module FrankWolfe -using Plots: SparseArrays using LinearAlgebra using Printf using ProgressMeter From 64201944099504c12e19441a037fa417a9d7f72d Mon Sep 17 00:00:00 2001 From: Matthieu <57533715+mattplo@users.noreply.github.com> Date: Tue, 20 Jul 2021 14:52:39 +0200 Subject: [PATCH 8/9] fix remaining merge conflict --- src/fw_algorithms.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index b61c989c8..d4740c8ec 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,12 +30,8 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, -<<<<<<< HEAD - print_callback=FrankWolfe.print_callback, - kwargs..., -======= print_callback=print_callback, ->>>>>>> master + kwargs..., ) # format string for output of the algorithm From 1a1ad364c9b622381f9713107f28470aff1bc82d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Sat, 7 Aug 2021 18:39:50 +0200 Subject: [PATCH 9/9] fix conflict, minor format --- src/fw_algorithms.jl | 6 +----- src/oracles.jl | 17 ++++++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index b61c989c8..d4740c8ec 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,12 +30,8 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, -<<<<<<< HEAD - print_callback=FrankWolfe.print_callback, - kwargs..., -======= print_callback=print_callback, ->>>>>>> master + kwargs..., ) # format string for output of the algorithm diff --git a/src/oracles.jl b/src/oracles.jl index 31fe7f03f..58e59bc59 100644 --- a/src/oracles.jl +++ b/src/oracles.jl @@ -307,12 +307,15 @@ end """ ChasingGradientLMO{LMO,T} -Oracle wrapping another one of type LMO to boost FW with gradient pursuit. -Pursuit rounds end once the alignment improvement with the direction of the gradient +Oracle boosting FW with gradient pursuit: the LMO constructs a direction as +a combination of multiple extreme vertices: +`d_t = ∑_k λ_k (v_k - x_t)` found in successive rounds. +Pursuit rounds terminate once the alignment improvement with the direction of the gradient is lower than `improv_tol` or `max_rounds_number` is reached. -See the [paper](https://arxiv.org/abs/2003.06369) -All keyword arguments are passed to the inner LMO. +The LMO also keeps internal containers `d`, `u`, `residual` and `m_residual` to reduce allocations. +See the [paper](https://arxiv.org/abs/2003.06369). +All keyword arguments to `compute_extreme_point` are passed to the inner LMO. """ mutable struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G,G2} <: LinearMinimizationOracle inner::LMO @@ -338,11 +341,13 @@ end function _zero!(d) @. d = 0 + return nothing end function _zero!(d::SparseArrays.AbstractSparseArray) @. 
d = 0 - return SparseArrays.dropzeros!(d) + SparseArrays.dropzeros!(d) + return nothing end function _inplace_plus(d, v) @@ -374,11 +379,9 @@ function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...) @. lmo.m_residual = direction for round in 1:lmo.max_rounds_number if norm(lmo.residual) <= eps(eltype(lmo.residual)) - #@show round break end v = compute_extreme_point(lmo.inner, lmo.m_residual; kwargs...) - #@. lmo.u = v - x @. lmo.u = -x _inplace_plus(lmo.u, v) d_norm = norm(lmo.d)
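
The boosted direction built by ChasingGradientLMO across this series can be summarized outside the package by the minimal sketch below, using only LinearAlgebra. It reproduces the pursuit loop of the first patch (with the zero-Λ guard added in the follow-up patches) and assumes a nonzero gradient, which the package version guards separately. The names simplex_vertex and chase_gradient are illustrative only and not part of FrankWolfe.jl; the inner oracle is hard-coded to the unit probability simplex so that the snippet is self-contained and runnable.

    using LinearAlgebra

    # Toy inner LMO: vertex of the unit probability simplex minimizing <direction, v>.
    simplex_vertex(direction) = (v = zeros(length(direction)); v[argmin(direction)] = 1.0; v)

    # One boosted-LMO call: accumulate d ≈ -gradient as a weighted combination of
    # (vertex - x) directions, then rescale by Λ so that x + d/Λ stays feasible.
    function chase_gradient(gradient, x; max_rounds=10, improv_tol=1e-3)
        d = zero(gradient)   # accumulated direction
        Λ = 0.0              # accumulated weights, used for the final rescaling
        align(dd) = norm(dd) <= eps() ? -1.0 : -dot(gradient, dd) / (norm(gradient) * norm(dd))
        for _ in 1:max_rounds
            residual = -gradient - d           # part of -∇f(x) not yet captured by d
            v = simplex_vertex(-residual)      # vertex best aligned with the residual
            u = v - x
            use_v = true
            if norm(d) > 0 && dot(residual, -d / norm(d)) > dot(residual, u)
                u = -d / norm(d)               # shrinking d beats adding a new vertex
                use_v = false
            end
            norm(u) > 0 || break
            λ = dot(residual, u) / norm(u)^2   # exact step length onto the residual
            d_new = d + λ * u
            align(d_new) - align(d) >= improv_tol || break  # stop once alignment stalls
            Λ = use_v ? Λ + λ : Λ * (1 - λ / norm(d))
            d = d_new
        end
        return Λ <= eps() ? x : x + d / Λ      # the "vertex" handed back to frank_wolfe
    end

    x = fill(0.2, 5)                           # current iterate in the simplex
    gradient = 2 .* (x .- [1, 0, 0, 0, 0])     # gradient of ||x - e1||^2 at x
    @show chase_gradient(gradient, x)          # recovers the vertex e1 here

With the constructor as of the "improvements & tests" patch, the wrapper itself takes preallocated work vectors and is passed to the solver in place of the inner oracle, e.g. lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol, d, residual) followed by FrankWolfe.frank_wolfe(f, grad!, lmo, x0, ...), as the added testset in test/lmo.jl does.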