From cc53d1b10b9be4ac453568b188d18a113e75f168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 31 Jul 2023 15:28:15 +0200 Subject: [PATCH 01/16] weak separation oracle --- src/abstract_oracles.jl | 102 ++++++++++++++++++++++++++++++++++++++-- src/fw_algorithms.jl | 21 ++++++--- 2 files changed, 111 insertions(+), 12 deletions(-) diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl index 2fe2f9e9e..3a5af8786 100644 --- a/src/abstract_oracles.jl +++ b/src/abstract_oracles.jl @@ -17,6 +17,29 @@ All LMOs should accept keyword arguments that they can ignore. """ function compute_extreme_point end +""" + compute_weak_separation_point(lmo, direction, max_value) -> (vertex, gap) + +Weak separation algorithm for a given oracle. +Unlike `compute_extreme_point`, `compute_weak_separation_point` may provide a suboptimal `vertex` with the following conditions: +- `vertex` is still a valid extreme point of the polytope. +IF an inexact vertex is computed: +- `⟨v, d⟩ ≤ max_value`, the pre-specified required improvement. +- `⟨v, d⟩ ≤ ⟨v_opt, d⟩ + gap`, with `v_opt` a vertex computed with the exact oracle. +- If the algorithm used to compute the inexact vertex provides a bound on optimality, the `gap` value must be valid. +- Otherwise, e.g. if the vertex is computed with a heuristic, `gap = ∞`. +ELSE (the oracle computes an optimal vertex): +- `⟨v, d⟩` may be greater than `max_value`, `gap` must be 0. +""" +function compute_weak_separation_point(lmo, direction, max_value; kwargs...) end + +# default to computing an exact vertex. +function compute_weak_separation_point(lmo::LinearMinimizationOracle, direction, max_value; kwargs...) + v = compute_extreme_point(lmo, direction; kwargs...) + gap = zero(eltype(v)) * zero(eltype(direction)) + return v, gap +end + """ CachedLinearMinimizationOracle{LMO} @@ -43,11 +66,29 @@ Vertices of `LMO` have to be of type `VT` if provided. mutable struct SingleLastCachedLMO{LMO,A} <: CachedLinearMinimizationOracle{LMO} last_vertex::Union{Nothing,A} inner::LMO + store_cache::Bool end # initializes with no cache by default SingleLastCachedLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} = - SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo) + SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo, true) + +# gap is 0 if exact, ∞ if cached point +function compute_weak_separation_point(lmo::SingleLastCachedLMO, direction, max_value; kwargs...) + if lmo.last_vertex !== nothing && isfinite(max_value) + # cache is a sufficiently-decreasing direction + if fast_dot(lmo.last_vertex, direction) ≤ max_value + T = promote_type(eltype(lmo.last_vertex), eltype(direction)) + return lmo.last_vertex, T(Inf) + end + end + v = compute_extreme_point(lmo.inner, direction, kwargs...) + if lmo.store_cache + lmo.last_vertex = v + end + T = promote_type(eltype(v), eltype(direction)) + return (v, zero(T)) +end function compute_extreme_point( lmo::SingleLastCachedLMO, @@ -62,7 +103,7 @@ function compute_extreme_point( return lmo.last_vertex end end - v = compute_extreme_point(lmo.inner, direction, kwargs...) + v = compute_extreme_point(lmo.inner, direction, v=v, kwargs...) 
if store_cache lmo.last_vertex = v end @@ -188,14 +229,16 @@ mutable struct VectorCacheLMO{LMO<:LinearMinimizationOracle,VT} <: CachedLinearMinimizationOracle{LMO} vertices::Vector{VT} inner::LMO + store_cache::Bool + greedy::Bool end function VectorCacheLMO{LMO,VT}(lmo::LMO) where {VT,LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,VT}(VT[], lmo) + return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false) end function VectorCacheLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo) + return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false) end function Base.empty!(lmo::VectorCacheLMO) @@ -205,6 +248,55 @@ end Base.length(lmo::VectorCacheLMO) = length(lmo.vertices) +function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value; kwargs...) + if isempty(lmo.vertices) + v = compute_extreme_point(lmo.inner, direction) + T = promote_type(eltype(v), eltype(direction)) + if lmo.store_cache + push!(lmo.vertices, v) + end + return v, zero(T) + end + best_idx = -1 + best_val = Inf + best_v = nothing + for idx in reverse(eachindex(lmo.vertices)) + @inbounds v = lmo.vertices[idx] + new_val = fast_dot(v, direction) + if new_val ≤ max_value + T = promote_type(eltype(v), eltype(direction)) + # stop and return + if lmo.greedy + return v, T(Inf) + end + # otherwise, compare to incumbent + if new_val < best_val + best_v = v + best_val = new_val + best_idx = idx + end + end + end + if best_idx > 0 + T = promote_type(eltype(best_v), eltype(direction)) + return best_v, T(Inf) + end + # no satisfactory vertex found + v = compute_extreme_point(lmo.inner, direction) + if lmo.store_cache + # note: we do not check for duplicates. hence you might end up with more vertices, + # in fact up to number of dual steps many, that might be already in the cache + # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it) + # and it is the best possible, hence we will perform a dual step on the outside. + # + # note: another possibility could be to test against that in the if statement but then you might end you recalculating the same vertex a few times. + # as such this might be a better tradeoff, i.e., to not check the set for duplicates and potentially accept #dual_steps many duplicates. 
+ push!(lmo.vertices, v) + end + T = promote_type(eltype(v), eltype(direction)) + return v, zero(v) +end + function compute_extreme_point( lmo::VectorCacheLMO, direction; @@ -228,7 +320,7 @@ function compute_extreme_point( @inbounds v = lmo.vertices[idx] new_val = fast_dot(v, direction) if new_val ≤ threshold - # stop, store and return + # stop and return if greedy return v end diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index eff4bdb57..9280b37af 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -293,6 +293,7 @@ function lazified_conditional_gradient( end lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + lmo.greedy = greedy_lazy if isfinite(cache_size) Base.sizehint!(lmo.vertices, cache_size) end @@ -300,7 +301,7 @@ function lazified_conditional_gradient( t = 0 dual_gap = Inf primal = Inf - v = [] + v = x0 x = x0 phi = Inf tt = regular @@ -353,7 +354,7 @@ function lazified_conditional_gradient( while t <= max_iteration && dual_gap >= max(epsilon, eps(float(eltype(x)))) ##################### - # managing time and Ctrl-C + # managing time ##################### time_at_loop = time_ns() if t == 0 @@ -382,12 +383,18 @@ function lazified_conditional_gradient( primal = f(x) end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep + v, gap = compute_weak_separation_point(lmo, gradient, threshold) + if gap > 0 + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + phi = min(dual_gap, phi / 2) + end + else + tt = regular dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, phi / 2) + phi = dual_gap end d = muladd_memory_mode(memory_mode, d, x, v) From 6f3ed3fae0eb0af4146088be9a52b6a2e5c01197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 31 Jul 2023 15:36:30 +0200 Subject: [PATCH 02/16] return type --- src/abstract_oracles.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl index 3a5af8786..ad92a5f12 100644 --- a/src/abstract_oracles.jl +++ b/src/abstract_oracles.jl @@ -87,7 +87,7 @@ function compute_weak_separation_point(lmo::SingleLastCachedLMO, direction, max_ lmo.last_vertex = v end T = promote_type(eltype(v), eltype(direction)) - return (v, zero(T)) + return v, zero(T) end function compute_extreme_point( @@ -294,7 +294,7 @@ function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value push!(lmo.vertices, v) end T = promote_type(eltype(v), eltype(direction)) - return v, zero(v) + return v, zero(T) end function compute_extreme_point( From 500deb631903a0ac67dbf8577ebf5f43f6253bf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 31 Jul 2023 18:21:22 +0200 Subject: [PATCH 03/16] useless duplicate --- src/moi_oracle.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/moi_oracle.jl b/src/moi_oracle.jl index 875b8001d..18014a225 100644 --- a/src/moi_oracle.jl +++ b/src/moi_oracle.jl @@ -128,7 +128,6 @@ function _optimize_and_return(lmo, variables) term_st = MOI.get(lmo.o, MOI.TerminationStatus()) if term_st ∉ (MOI.OPTIMAL, MOI.ALMOST_OPTIMAL, MOI.SLOW_PROGRESS) @error "Unexpected termination: $term_st" - return MOI.get.(lmo.o, MOI.VariablePrimal(), variables) end return MOI.get.(lmo.o, MOI.VariablePrimal(), variables) end From d3a012486844fa4c3c53d41f8a12feb2610061ba Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 14:49:21 +0200 Subject: [PATCH 04/16] weak separation in AFW --- src/afw.jl | 57 +++++++++++++++++++-------- src/defs.jl | 2 + src/tracking.jl | 7 +++- test/weak_separation.jl | 85 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 16 deletions(-) create mode 100644 test/weak_separation.jl diff --git a/src/afw.jl b/src/afw.jl index 851aec274..59fe67982 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -33,6 +33,7 @@ function away_frank_wolfe( use_extra_vertex_storage=false, linesearch_workspace=nothing, recompute_last_vertex=true, + weak_separation=false, ) # add the first vertex to active set from initialization active_set = ActiveSet([(1.0, x0)]) @@ -64,6 +65,7 @@ function away_frank_wolfe( use_extra_vertex_storage=use_extra_vertex_storage, linesearch_workspace=linesearch_workspace, recompute_last_vertex=recompute_last_vertex, + weak_separation=weak_separation, ) end @@ -95,6 +97,7 @@ function away_frank_wolfe( use_extra_vertex_storage=false, linesearch_workspace=nothing, recompute_last_vertex=true, + weak_separation=false, ) # format string for output of the algorithm format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" @@ -153,7 +156,7 @@ function away_frank_wolfe( ) grad_type = typeof(gradient) println( - "GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance MOMENTUM: $momentum AWAYSTEPS: $away_steps", + "GRADIENT TYPE: $grad_type LAZY: $lazy LAZY_TOLERANCE: $lazy_tolerance WEAK_SEPARATION: $weak_separation MOMENTUM: $momentum AWAYSTEPS: $away_steps", ) println("Linear Minimization Oracle: $(typeof(lmo))") if (use_extra_vertex_storage || add_dropped_vertices) && extra_vertex_storage === nothing @@ -223,10 +226,16 @@ function away_frank_wolfe( extra_vertex_storage=extra_vertex_storage, lazy_tolerance=lazy_tolerance, memory_mode=memory_mode, + weak_separation=weak_separation, ) else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = - afw_step(x, gradient, lmo, active_set, epsilon, d, memory_mode=memory_mode) + afw_step( + x, gradient, lmo, active_set, epsilon, d, + memory_mode=memory_mode, + weak_separation=weak_separation, + lazy_tolerance=lazy_tolerance, + ) end else d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt = @@ -248,7 +257,7 @@ function away_frank_wolfe( memory_mode, ) - gamma = min(gamma_max, gamma) + gamma = min(gamma_max, gamma) # cleanup and renormalize every x iterations. Only for the fw steps. renorm = mod(t, renorm_interval) == 0 if away_step_taken @@ -368,9 +377,9 @@ function away_frank_wolfe( return x, v, primal, dual_gap, traj_data, active_set end -function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis()) +function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=true) _, v, v_loc, _, a_lambda, a, a_loc, _, _ = active_set_argminmax(active_set, gradient) - #Do lazy FW step + # do lazy FW step grad_dot_lazy_fw_vertex = fast_dot(v, gradient) grad_dot_x = fast_dot(x, gradient) grad_dot_a = fast_dot(a, gradient) @@ -385,7 +394,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ fw_step_taken = true index = v_loc else - #Do away step, as it promises enough progress. 
+ # do away step, as it promises enough progress. if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex && grad_dot_a - grad_dot_x >= phi / lazy_tolerance tt = away @@ -395,7 +404,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ away_step_taken = true fw_step_taken = false index = a_loc - #Resort to calling the LMO + # resort to calling the LMO else # optionally: try vertex storage if use_extra_vertex_storage @@ -406,13 +415,20 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ @debug("Found acceptable lazy vertex in storage") v = new_forward_vertex tt = lazylazy + end + else + found_better_vertex = false + end + if !found_better_vertex + # compute new vertex with normal or weak oracle + if weak_separation + lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance + (v, _) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + tt = weaksep else v = compute_extreme_point(lmo, gradient) tt = regular end - else - v = compute_extreme_point(lmo, gradient) - tt = regular end # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) @@ -439,14 +455,25 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt end -function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis()) +function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=false, lazy_tolerance=2.0) _, _, _, _, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient) - v = compute_extreme_point(lmo, gradient) grad_dot_x = fast_dot(x, gradient) away_gap = fast_dot(a, gradient) - grad_dot_x - dual_gap = grad_dot_x - fast_dot(v, gradient) - if dual_gap >= away_gap && dual_gap >= epsilon - tt = regular + (v, gap) = if weak_separation + # Condition for taking a FW step + # ⟨∇f, x-v⟩ ≥ gₐ + # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - gₐ + # We ask for a bit more on the FW step + # to promote away steps when we can (and therefore sparsity) + # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - K gₐ + lazy_threshold = grad_dot_x - lazy_tolerance * away_gap + compute_weak_separation_point(lmo, gradient, lazy_threshold) + else + (compute_extreme_point(lmo, gradient), 0.0) + end + dual_gap = grad_dot_x - fast_dot(v, gradient) + gap + if dual_gap > away_gap && dual_gap >= epsilon + tt = gap == 0.0 ? regular : weaksep gamma_max = one(a_lambda) d = muladd_memory_mode(memory_mode, d, x, v) vertex = v diff --git a/src/defs.jl b/src/defs.jl index db3c8c0f1..c251b8181 100644 --- a/src/defs.jl +++ b/src/defs.jl @@ -19,6 +19,7 @@ struct OutplaceEmphasis <: MemoryEmphasis end away = 6 pairwise = 7 drop = 8 + weaksep = 9 simplex_descent = 101 gap_step = 102 last = 1000 @@ -34,6 +35,7 @@ const st = ( away="A", pairwise="P", drop="D", + weaksep="W", simplex_descent="SD", gap_step="GS", last="Last", diff --git a/src/tracking.jl b/src/tracking.jl index 949806fbf..205043cc1 100644 --- a/src/tracking.jl +++ b/src/tracking.jl @@ -55,7 +55,12 @@ end function compute_extreme_point(lmo::TrackingLMO, x; kwargs...) lmo.counter += 1 - return compute_extreme_point(lmo.lmo, x) + return compute_extreme_point(lmo.lmo, x; kwargs...) 
+end + +function compute_weak_separation_point(lmo::TrackingLMO, direction, max_value) + lmo.counter += 1 + return compute_weak_separation_point(lmo.lmo, direction, max_value) end is_tracking_lmo(lmo) = false diff --git a/test/weak_separation.jl b/test/weak_separation.jl new file mode 100644 index 000000000..9894cc402 --- /dev/null +++ b/test/weak_separation.jl @@ -0,0 +1,85 @@ +using FrankWolfe +using LinearAlgebra +using Test + +""" +A test LMO representing the feasible set [0,1]^n +""" +struct Hypercube <: FrankWolfe.LinearMinimizationOracle +end + +function FrankWolfe.compute_extreme_point(::Hypercube, direction::AbstractArray{T}) where {T} + v = similar(direction, T) + v .= 0 + for idx in eachindex(v) + if direction[idx] < 0 + v[idx] = 1 + end + end + return v +end + +# a very naive implementation of a weak separation oracle without a gap (a heuristic) +# we stop iterating through the indices whenever we obtain sufficient progress +function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::AbstractArray{T}, max_value) where {T} + v = similar(direction, T) + v .= 0 + res = zero(T) + gap = zero(T) + for idx in eachindex(v) + if direction[idx] < 0 + v[idx] = 1 + res += direction[idx] + end + if res <= max_value && idx != lastindex(v) + gap = T(Inf) + break + @info "avoided $(length(v) - idx) iterations" + end + end + return v, gap +end + +@testset "Basic behaviour" begin + lmo = Hypercube() + direction = rand(4,2) + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test size(v) == size(direction) + @test norm(v) == 0 + (w, _) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test norm(w) == 0 + direction .*= -1 + direction[end] -= 1 + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test v == ones(size(direction)) + (w, gap0) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test dot(w, direction) <= -1 + @test gap0 == Inf + (w, gap1) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1.5) + @test dot(w, direction) <= -1.5 + @test gap1 == Inf + # asking for too much progress results in exact LMO call + (w, gap3) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1000) + @test gap3 == 0.0 + @test w == v +end + +@testset "AFW with weak separation" begin + n = 1000 + # reference point to get an optimum on a face + ref_point = [0.6 + mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function grad!(storage, x) + storage .= x + storage .-= ref_point + storage ./= n + end + lmo = Hypercube() + x0 = FrankWolfe.compute_extreme_point(lmo, -ones(n)) + tracking_lmo = FrankWolfe.TrackingLMO(lmo) + for lazy in (false, true) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(lmo) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true, lazy=lazy) + end +end From 0617339a348c71e7e16ae4fecbde5ac051e22afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 15:01:53 +0200 Subject: [PATCH 05/16] tests --- docs/src/reference/2_lmo.md | 9 ++++++++- test/runtests.jl | 6 ++++++ test/weak_separation.jl | 7 ++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/src/reference/2_lmo.md b/docs/src/reference/2_lmo.md index b75d936dd..72459b9a2 100644 --- a/docs/src/reference/2_lmo.md 
+++ b/docs/src/reference/2_lmo.md @@ -5,7 +5,7 @@ The Linear Minimization Oracle (LMO) is a key component called at each iteration v\in \argmin_{x\in \mathcal{C}} \langle d,x \rangle. ``` -See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on most LMOs +See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on essential LMOs implemented in the package and their comparison with projection operators. ## Interface and wrappers @@ -19,6 +19,13 @@ All of them are subtypes of [`FrankWolfe.LinearMinimizationOracle`](@ref) and im compute_extreme_point ``` +Optionally, an LMO can implement a weak separation procedure based either on a heuristic or on an approximation algorithm: +```@docs +compute_weak_separation_point +``` + +Weak separation procedures will be used in the methods using an active set and lazified variants only. + We also provide some meta-LMOs wrapping another one with extended behavior: ```@docs FrankWolfe.CachedLinearMinimizationOracle diff --git a/test/runtests.jl b/test/runtests.jl index 1d0c9c4c9..d33b7378c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -785,3 +785,9 @@ include("generic-arrays.jl") end end end + +module WeakSeparation +@testset "weak separation oracles" begin + include("weak_separation.jl") +end +end diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 9894cc402..553084536 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -76,10 +76,11 @@ end end lmo = Hypercube() x0 = FrankWolfe.compute_extreme_point(lmo, -ones(n)) - tracking_lmo = FrankWolfe.TrackingLMO(lmo) for lazy in (false, true) - x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false, lazy=lazy) + tracking_lmo = FrankWolfe.TrackingLMO(lmo) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) tracking_weak = FrankWolfe.TrackingLMO(lmo) - x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true, lazy=lazy) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter <= tracking_weak.counter end end From 4172c516c8b663454cbfd576bcfe58016b565346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 17:46:28 +0200 Subject: [PATCH 06/16] include module --- test/runtests.jl | 1 + test/weak_separation.jl | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d33b7378c..fa9c9b29e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -787,6 +787,7 @@ include("generic-arrays.jl") end module WeakSeparation +using Test @testset "weak separation oracles" begin include("weak_separation.jl") end diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 553084536..9789ca671 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -3,12 +3,16 @@ using LinearAlgebra using Test """ -A test LMO representing the feasible set [0,1]^n +A test LMO representing the feasible set [0,1]^n. +`nops` tracks the number of operations performed. 
""" -struct Hypercube <: FrankWolfe.LinearMinimizationOracle +mutable struct Hypercube <: FrankWolfe.LinearMinimizationOracle + nops::Int end -function FrankWolfe.compute_extreme_point(::Hypercube, direction::AbstractArray{T}) where {T} +Hypercube() = Hypercube(0) + +function FrankWolfe.compute_extreme_point(lmo::Hypercube, direction::AbstractArray{T}) where {T} v = similar(direction, T) v .= 0 for idx in eachindex(v) @@ -16,6 +20,7 @@ function FrankWolfe.compute_extreme_point(::Hypercube, direction::AbstractArray{ v[idx] = 1 end end + lmo.nops += length(v) return v end @@ -26,7 +31,9 @@ function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::Abstra v .= 0 res = zero(T) gap = zero(T) + nops = 0 for idx in eachindex(v) + nops += 1 if direction[idx] < 0 v[idx] = 1 res += direction[idx] @@ -34,9 +41,9 @@ function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::Abstra if res <= max_value && idx != lastindex(v) gap = T(Inf) break - @info "avoided $(length(v) - idx) iterations" end end + lmo.nops += nops return v, gap end @@ -74,12 +81,11 @@ end storage .-= ref_point storage ./= n end - lmo = Hypercube() - x0 = FrankWolfe.compute_extreme_point(lmo, -ones(n)) + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) for lazy in (false, true) - tracking_lmo = FrankWolfe.TrackingLMO(lmo) + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) - tracking_weak = FrankWolfe.TrackingLMO(lmo) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) @test tracking_lmo.counter <= tracking_weak.counter end From 2102dfad12050b7b865090288770b5a84131e71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 17 Aug 2023 18:24:38 +0200 Subject: [PATCH 07/16] include lmo arg --- test/weak_separation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 9789ca671..d21dd49f2 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -26,7 +26,7 @@ end # a very naive implementation of a weak separation oracle without a gap (a heuristic) # we stop iterating through the indices whenever we obtain sufficient progress -function FrankWolfe.compute_weak_separation_point(::Hypercube, direction::AbstractArray{T}, max_value) where {T} +function FrankWolfe.compute_weak_separation_point(lmo::Hypercube, direction::AbstractArray{T}, max_value) where {T} v = similar(direction, T) v .= 0 res = zero(T) From b2a2d9b41382498a04de714190f6d87d281b3ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 21 Aug 2023 11:12:59 +0200 Subject: [PATCH 08/16] started pairwise --- src/pairwise.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 05c393deb..4f0a568f7 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -30,6 +30,7 @@ function blended_pairwise_conditional_gradient( add_dropped_vertices=false, use_extra_vertex_storage=false, recompute_last_vertex=true, + weak_separation=false, ) # add the first vertex to active set from initialization active_set = ActiveSet([(1.0, x0)]) @@ -58,6 +59,7 @@ function blended_pairwise_conditional_gradient( add_dropped_vertices=add_dropped_vertices, 
use_extra_vertex_storage=use_extra_vertex_storage, recompute_last_vertex=recompute_last_vertex, + weak_separation=weak_separation, ) end @@ -90,6 +92,7 @@ function blended_pairwise_conditional_gradient( add_dropped_vertices=false, use_extra_vertex_storage=false, recompute_last_vertex=true, + weak_separation=false, ) # format string for output of the algorithm @@ -137,7 +140,7 @@ function blended_pairwise_conditional_gradient( "MEMORY_MODE: $memory_mode STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $NumType", ) grad_type = typeof(gradient) - println("GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance") + println("GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance WEAK_SEPARATION: $weak_separation") println("Linear Minimization Oracle: $(typeof(lmo))") if use_extra_vertex_storage && !lazy @info("vertex storage only used in lazy mode") From 2deff33db41ee1daf92ccd6b84df4eeea4fe9c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 31 Aug 2023 16:46:15 +0200 Subject: [PATCH 09/16] continue weak --- src/afw.jl | 6 +++--- src/pairwise.jl | 29 +++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/afw.jl b/src/afw.jl index 59fe67982..68bddb1fe 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -440,8 +440,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ away_step_taken = false fw_step_taken = true index = -1 - #Lower our expectation for progress. - else + else # lower our expectation for progress. tt = dualstep phi = min(dual_gap, phi / 2.0) gamma_max = zero(a_lambda) @@ -471,7 +470,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE else (compute_extreme_point(lmo, gradient), 0.0) end - dual_gap = grad_dot_x - fast_dot(v, gradient) + gap + dual_gap = grad_dot_x - fast_dot(v, gradient) - gap if dual_gap > away_gap && dual_gap >= epsilon tt = gap == 0.0 ? regular : weaksep gamma_max = one(a_lambda) @@ -489,6 +488,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE fw_step_taken = false index = a_loc else + @assert gap == 0.0 tt = away gamma_max = zero(a_lambda) vertex = a diff --git a/src/pairwise.jl b/src/pairwise.jl index 4f0a568f7..fd9a5b089 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -277,27 +277,39 @@ function blended_pairwise_conditional_gradient( v = new_forward_vertex tt = lazylazy else - v = compute_extreme_point(lmo, gradient) - tt = regular + (v, gap) = if weak_separation + compute_weak_separation_point(lmo, gradient, lazy_threshold) + else + (compute_extreme_point(lmo, gradient), 0.0) + end + tt = gap == 0.0 ? regular : weaksep end else # for t == 1, v is already computed before first iteration - if t > 1 - v = compute_extreme_point(lmo, gradient) + if t == 1 + gap = 0.0 + else + (v, gap) = if weak_separation + lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance + (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + else + v = compute_extreme_point(lmo, gradient) + gap = 0.0 + end end - tt = regular + tt = gap == 0.0 ? 
regular : weaksep end end vertex_taken = v dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) # if we are about to exit, compute dual_gap with the cleaned-up x - if dual_gap ≤ epsilon + if dual_gap + gap ≤ epsilon active_set_renormalize!(active_set) active_set_cleanup!(active_set) compute_active_set_iterate!(active_set) x = get_active_set_iterate(active_set) grad!(gradient, x) - dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) + dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) + gap end # Note: In the following, we differentiate between lazy and non-lazy updates. # The reason is that the non-lazy version does not use phi but the lazy one heavily depends on it. @@ -365,7 +377,8 @@ function blended_pairwise_conditional_gradient( # that is ok as we scale with the K = 2.0 default anyways # we only update the dual gap if the step was regular (not lazy from discarded set) if tt != lazylazy - phi = dual_gap + @assert dual_gap + gap < phi + phi = dual_gap + gap @debug begin @assert tt == regular v2 = compute_extreme_point(lmo, gradient) From 05ebc9db68ba4f220025e4705c178edc963a7468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 31 Aug 2023 18:10:18 +0200 Subject: [PATCH 10/16] gap not needed for stopping --- src/pairwise.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index fd9a5b089..8cda89d7e 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -291,10 +291,11 @@ function blended_pairwise_conditional_gradient( else (v, gap) = if weak_separation lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance - (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + compute_weak_separation_point(lmo, gradient, lazy_threshold) else v = compute_extreme_point(lmo, gradient) gap = 0.0 + (v, gap) end end tt = gap == 0.0 ? regular : weaksep @@ -303,13 +304,13 @@ function blended_pairwise_conditional_gradient( vertex_taken = v dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) # if we are about to exit, compute dual_gap with the cleaned-up x - if dual_gap + gap ≤ epsilon + if dual_gap ≤ epsilon active_set_renormalize!(active_set) active_set_cleanup!(active_set) compute_active_set_iterate!(active_set) x = get_active_set_iterate(active_set) grad!(gradient, x) - dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) + gap + dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) end # Note: In the following, we differentiate between lazy and non-lazy updates. # The reason is that the non-lazy version does not use phi but the lazy one heavily depends on it. @@ -320,7 +321,7 @@ function blended_pairwise_conditional_gradient( # - for lazy: we also accept slightly weaker vertices, those satisfying phi / lazy_tolerance # this should simplify the criterion. # DO NOT CHANGE without good reason and talk to Sebastian first for the logic behind this. 
- if !lazy || dual_gap ≥ phi / lazy_tolerance + if !lazy || dual_gap ≥ phi / lazy_tolerance d = muladd_memory_mode(memory_mode, d, x, v) gamma = perform_line_search( From 192286bfaf59c750fb1e54c6667815189219b14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Thu, 31 Aug 2023 23:22:13 +0200 Subject: [PATCH 11/16] fix afw --- src/afw.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/afw.jl b/src/afw.jl index 68bddb1fe..eecc59be4 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -470,7 +470,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE else (compute_extreme_point(lmo, gradient), 0.0) end - dual_gap = grad_dot_x - fast_dot(v, gradient) - gap + dual_gap = grad_dot_x - fast_dot(v, gradient) if dual_gap > away_gap && dual_gap >= epsilon tt = gap == 0.0 ? regular : weaksep gamma_max = one(a_lambda) @@ -488,7 +488,6 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE fw_step_taken = false index = a_loc else - @assert gap == 0.0 tt = away gamma_max = zero(a_lambda) vertex = a @@ -496,7 +495,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE fw_step_taken = false index = a_loc end - return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt + return d, vertex, index, gamma_max, dual_gap + gap, away_step_taken, fw_step_taken, tt end function fw_step(x, gradient, lmo, d; memory_mode::MemoryEmphasis = InplaceEmphasis()) From 71910a41245feda09c3f8e1cf090b39fb4f4c145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Mon, 4 Sep 2023 13:01:48 +0200 Subject: [PATCH 12/16] finished implementation --- src/afw.jl | 14 ++++++++------ src/pairwise.jl | 2 +- test/weak_separation.jl | 9 ++++++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/afw.jl b/src/afw.jl index eecc59be4..bb635c7a3 100644 --- a/src/afw.jl +++ b/src/afw.jl @@ -423,16 +423,17 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ # compute new vertex with normal or weak oracle if weak_separation lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance - (v, _) = compute_weak_separation_point(lmo, gradient, lazy_threshold) - tt = weaksep + (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold) + tt = gap == 0.0 ? regular : weaksep else v = compute_extreme_point(lmo, gradient) + gap = zero(eltype(v)) tt = regular end end - # Real dual gap promises enough progress. grad_dot_fw_vertex = fast_dot(v, gradient) dual_gap = grad_dot_x - grad_dot_fw_vertex + # Real dual gap promises enough progress. if dual_gap >= phi / lazy_tolerance gamma_max = one(a_lambda) d = muladd_memory_mode(memory_mode, d, x, v) @@ -441,6 +442,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_ fw_step_taken = true index = -1 else # lower our expectation for progress. 
+ @assert tt != weaksep tt = dualstep phi = min(dual_gap, phi / 2.0) gamma_max = zero(a_lambda) @@ -460,15 +462,15 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE away_gap = fast_dot(a, gradient) - grad_dot_x (v, gap) = if weak_separation # Condition for taking a FW step - # ⟨∇f, x-v⟩ ≥ gₐ + # ⟨∇f, x-v⟩ ≥ gₐ <=> # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - gₐ - # We ask for a bit more on the FW step + # We ask for a bit more progress on the FW step # to promote away steps when we can (and therefore sparsity) # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - K gₐ lazy_threshold = grad_dot_x - lazy_tolerance * away_gap compute_weak_separation_point(lmo, gradient, lazy_threshold) else - (compute_extreme_point(lmo, gradient), 0.0) + (compute_extreme_point(lmo, gradient), zero(away_gap)) end dual_gap = grad_dot_x - fast_dot(v, gradient) if dual_gap > away_gap && dual_gap >= epsilon diff --git a/src/pairwise.jl b/src/pairwise.jl index 8cda89d7e..1b9d0a9f9 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -376,7 +376,7 @@ function blended_pairwise_conditional_gradient( else # dual step # set to computed dual_gap for consistency between the lazy and non-lazy run. # that is ok as we scale with the K = 2.0 default anyways - # we only update the dual gap if the step was regular (not lazy from discarded set) + # we only update the dual gap if the step was regular or weaksep (not lazy from discarded set) if tt != lazylazy @assert dual_gap + gap < phi phi = dual_gap + gap diff --git a/test/weak_separation.jl b/test/weak_separation.jl index d21dd49f2..02f324b18 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -71,7 +71,7 @@ end @test w == v end -@testset "AFW with weak separation" begin +@testset "AFW and BPCG with weak separation" begin n = 1000 # reference point to get an optimum on a face ref_point = [0.6 + mod(idx, 2) for idx in 1:n] @@ -88,5 +88,12 @@ end tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) @test tracking_lmo.counter <= tracking_weak.counter + + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter <= tracking_weak.counter + end end From b94e95dca2473956fc33dea172f47925a507d7e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 5 Sep 2023 14:03:42 +0200 Subject: [PATCH 13/16] non-lazy BPCG --- src/pairwise.jl | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 1b9d0a9f9..71cebc739 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -204,14 +204,24 @@ function blended_pairwise_conditional_gradient( local_gap = dot_away_vertex - dot_forward_vertex if !lazy if t > 1 - v = compute_extreme_point(lmo, gradient) - dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v) - phi = dual_gap + dot_x = fast_dot(gradient, x) + (v, weak_gap) = if weak_separation + # we need a separation point v + # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_threshold + # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap 
* lazy_threshold + threshold = dot_x - local_gap * lazy_threshold + compute_weak_separation_point(lmo, gradient, threshold) + else + v = compute_extreme_point(lmo, gradient) + (v, zero(phi)) + end + dual_gap = dot_x - fast_dot(gradient, v) + phi = dual_gap + weak_gap end end # minor modification from original paper for improved sparsity # (proof follows with minor modification when estimating the step) - if local_gap ≥ phi / lazy_tolerance + if local_gap ≥ phi / lazy_tolerance # pairwise step d = muladd_memory_mode(memory_mode, d, a, v_local) vertex_taken = v_local gamma_max = a_lambda From dbb6622bd55c81f68d2b1d0bccad5ac1d355f2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 5 Sep 2023 14:18:22 +0200 Subject: [PATCH 14/16] var name --- src/pairwise.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 71cebc739..4660477cb 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -207,9 +207,9 @@ function blended_pairwise_conditional_gradient( dot_x = fast_dot(gradient, x) (v, weak_gap) = if weak_separation # we need a separation point v - # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_threshold - # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap * lazy_threshold - threshold = dot_x - local_gap * lazy_threshold + # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_tolerance + # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap * lazy_tolerance + threshold = dot_x - local_gap * lazy_tolerance compute_weak_separation_point(lmo, gradient, threshold) else v = compute_extreme_point(lmo, gradient) From cb729b3a2ecd0b1629d955af7bbb6d229184b9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 5 Sep 2023 16:04:31 +0200 Subject: [PATCH 15/16] higher cardinality optimum --- test/weak_separation.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 02f324b18..41f97becc 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -73,8 +73,8 @@ end @testset "AFW and BPCG with weak separation" begin n = 1000 - # reference point to get an optimum on a face - ref_point = [0.6 + mod(idx, 2) for idx in 1:n] + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) function grad!(storage, x) storage .= x @@ -87,13 +87,12 @@ end x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) - @test tracking_lmo.counter <= tracking_weak.counter + @test tracking_lmo.counter < tracking_weak.counter tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) - @test tracking_lmo.counter <= tracking_weak.counter - + @test tracking_lmo.counter < tracking_weak.counter end end From fb6773abfc048f2c727582e87c09ac604749363a Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Sun, 10 Sep 2023 19:35:57 +0200 Subject: [PATCH 16/16] weak inner in lazified FW --- src/abstract_oracles.jl | 25 ++++++++++++++++++------- src/fw_algorithms.jl | 23 ++++++++++++----------- test/weak_separation.jl | 19 +++++++++++++++++++ 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl index ad92a5f12..de7f187c9 100644 --- a/src/abstract_oracles.jl +++ b/src/abstract_oracles.jl @@ -231,14 +231,15 @@ mutable struct VectorCacheLMO{LMO<:LinearMinimizationOracle,VT} <: inner::LMO store_cache::Bool greedy::Bool + weak_separation::Bool end function VectorCacheLMO{LMO,VT}(lmo::LMO) where {VT,LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false) + return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false, false) end function VectorCacheLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} - return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false) + return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false, false) end function Base.empty!(lmo::VectorCacheLMO) @@ -250,12 +251,17 @@ Base.length(lmo::VectorCacheLMO) = length(lmo.vertices) function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value; kwargs...) if isempty(lmo.vertices) - v = compute_extreme_point(lmo.inner, direction) + v, gap = if lmo.weak_separation + compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...) + else + v = compute_extreme_point(lmo.inner, direction; kwargs...) + v, zero(eltype(v)) + end T = promote_type(eltype(v), eltype(direction)) if lmo.store_cache push!(lmo.vertices, v) end - return v, zero(T) + return v, T(gap) end best_idx = -1 best_val = Inf @@ -281,8 +287,13 @@ function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value T = promote_type(eltype(best_v), eltype(direction)) return best_v, T(Inf) end - # no satisfactory vertex found - v = compute_extreme_point(lmo.inner, direction) + # no satisfactory vertex found, call oracle + v, gap = if lmo.weak_separation + compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...) + else + v = compute_extreme_point(lmo.inner, direction; kwargs...) + v, zero(eltype(v)) + end if lmo.store_cache # note: we do not check for duplicates. 
hence you might end up with more vertices, # in fact up to number of dual steps many, that might be already in the cache @@ -294,7 +305,7 @@ function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value push!(lmo.vertices, v) end T = promote_type(eltype(v), eltype(direction)) - return v, zero(T) + return v, T(gap) end function compute_extreme_point( diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 4102cd2d8..814c44473 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -273,6 +273,7 @@ function lazified_conditional_gradient( VType=typeof(x0), timeout=Inf, linesearch_workspace=nothing, + weak_separation=false, ) # format string for output of the algorithm @@ -297,6 +298,7 @@ function lazified_conditional_gradient( if isfinite(cache_size) Base.sizehint!(lmo.vertices, cache_size) end + lmo.weak_separation = weak_separation t = 0 dual_gap = Inf @@ -351,7 +353,7 @@ function lazified_conditional_gradient( linesearch_workspace = build_linesearch_workspace(line_search, x, gradient) end - while t <= max_iteration && dual_gap >= max(epsilon, eps(float(eltype(x)))) + while t <= max_iteration && phi >= max(epsilon, eps(float(eltype(x)))) ##################### # managing time @@ -383,18 +385,17 @@ function lazified_conditional_gradient( primal = f(x) end - v, gap = compute_weak_separation_point(lmo, gradient, threshold) - if gap > 0 - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, phi / 2) - end + v, weak_gap = compute_weak_separation_point(lmo, gradient, threshold) + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + @assert weak_gap == 0 + phi = min(dual_gap, phi / 2) else - tt = regular + tt = weak_gap > 0 ? weaksep : regular dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = dual_gap + phi = min(phi, dual_gap + weak_gap) end d = muladd_memory_mode(memory_mode, d, x, v) diff --git a/test/weak_separation.jl b/test/weak_separation.jl index 41f97becc..b7c214f9d 100644 --- a/test/weak_separation.jl +++ b/test/weak_separation.jl @@ -96,3 +96,22 @@ end @test tracking_lmo.counter < tracking_weak.counter end end + +@testset "Lazy-FW with weak separation" begin + n = 1000 + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function grad!(storage, x) + storage .= x + storage .-= ref_point + storage ./= n + end + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false) + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true) + @test tracking_lmo.counter < tracking_weak.counter +end
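
Usage sketch for the weak separation interface. This is an illustrative example only: the `Box` oracle type, the quadratic objective, `x_ref`, `n`, and the parameter choices below are made up for demonstration and assume the patches above are applied; the `compute_weak_separation_point` contract and the `weak_separation` keyword are the ones introduced in PATCH 01 and PATCH 04, and the heuristic mirrors the `Hypercube` test LMO from test/weak_separation.jl.

using FrankWolfe
using LinearAlgebra

# Toy feasible region: the box [0, 1]^n (hypothetical example type).
struct Box <: FrankWolfe.LinearMinimizationOracle end

# Exact oracle: the optimal vertex sets a coordinate to 1 exactly when the
# corresponding entry of the direction is negative.
function FrankWolfe.compute_extreme_point(::Box, direction::AbstractVector{T}; kwargs...) where {T}
    return T.(direction .< 0)
end

# Heuristic weak oracle: build the same vertex coordinate by coordinate and
# stop as soon as ⟨direction, v⟩ ≤ max_value. An early exit returns a valid but
# possibly suboptimal vertex with gap = Inf (no optimality bound); running
# through all coordinates yields the exact vertex, so gap = 0.
function FrankWolfe.compute_weak_separation_point(::Box, direction::AbstractVector{T}, max_value; kwargs...) where {T}
    v = zeros(T, length(direction))
    val = zero(T)
    for i in eachindex(direction)
        if direction[i] < 0
            v[i] = one(T)
            val += direction[i]
        end
        if val <= max_value && i != lastindex(direction)
            return v, T(Inf)   # inexact vertex, no optimality bound
        end
    end
    return v, zero(T)          # loop completed: vertex is exact
end

n = 100
x_ref = fill(0.3, n)                        # target point inside the box
f(x) = 0.5 * norm(x - x_ref)^2
grad!(storage, x) = (storage .= x .- x_ref)

x0 = FrankWolfe.compute_extreme_point(Box(), -ones(n))
x, v, primal, dual_gap = FrankWolfe.away_frank_wolfe(
    f, grad!, Box(), x0;
    lazy=true,
    weak_separation=true,   # call the weak oracle whenever it can promise enough progress
)

When the weak oracle exits early (gap = Inf), the step is reported as `weaksep` ("W" in the verbose output); dual steps, which halve the phi estimate, are only taken from exact oracle answers (gap = 0), as the assertions in the patches above enforce.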