diff --git a/docs/src/reference/2_lmo.md b/docs/src/reference/2_lmo.md
index b75d936dd..72459b9a2 100644
--- a/docs/src/reference/2_lmo.md
+++ b/docs/src/reference/2_lmo.md
@@ -5,7 +5,7 @@ The Linear Minimization Oracle (LMO) is a key component called at each iteration
 v\in \argmin_{x\in \mathcal{C}} \langle d,x \rangle.
 ```
 
-See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on most LMOs
+See [Combettes, Pokutta 2021](https://arxiv.org/abs/2101.10040) for references on essential LMOs
 implemented in the package and their comparison with projection operators.
 
 ## Interface and wrappers
@@ -19,6 +19,13 @@ All of them are subtypes of [`FrankWolfe.LinearMinimizationOracle`](@ref) and im
 compute_extreme_point
 ```
 
+Optionally, an LMO can implement a weak separation procedure based either on a heuristic or on an approximation algorithm:
+```@docs
+compute_weak_separation_point
+```
+
+Weak separation procedures are used only by the lazified variants and by the methods maintaining an active set.
+
 We also provide some meta-LMOs wrapping another one with extended behavior:
 ```@docs
 FrankWolfe.CachedLinearMinimizationOracle
diff --git a/src/abstract_oracles.jl b/src/abstract_oracles.jl
index 2fe2f9e9e..de7f187c9 100644
--- a/src/abstract_oracles.jl
+++ b/src/abstract_oracles.jl
@@ -17,6 +17,29 @@ All LMOs should accept keyword arguments that they can ignore.
 """
 function compute_extreme_point end
 
+"""
+    compute_weak_separation_point(lmo, direction, max_value) -> (vertex, gap)
+
+Weak separation algorithm for a given oracle.
+Unlike `compute_extreme_point`, `compute_weak_separation_point` may return a suboptimal `vertex` under the following conditions:
+- `vertex` is always a valid extreme point of the polytope.
+If an inexact vertex is computed:
+- `⟨v, d⟩ ≤ max_value`, i.e. the vertex attains the pre-specified required improvement.
+- `⟨v, d⟩ ≤ ⟨v_opt, d⟩ + gap`, with `v_opt` a vertex computed by the exact oracle.
+- If the algorithm computing the inexact vertex provides a bound on its optimality, the returned `gap` must be valid.
+- Otherwise, e.g. if the vertex is computed with a heuristic, `gap = ∞`.
+Otherwise (the oracle computed an optimal vertex):
+- `⟨v, d⟩` may be greater than `max_value`, and `gap` must be 0.
+"""
+function compute_weak_separation_point(lmo, direction, max_value; kwargs...) end
+
+# default to computing an exact vertex.
+function compute_weak_separation_point(lmo::LinearMinimizationOracle, direction, max_value; kwargs...)
+    v = compute_extreme_point(lmo, direction; kwargs...)
+    gap = zero(eltype(v)) * zero(eltype(direction))
+    return v, gap
+end
+
 """
     CachedLinearMinimizationOracle{LMO}
 
@@ -43,11 +66,29 @@ Vertices of `LMO` have to be of type `VT` if provided.
 mutable struct SingleLastCachedLMO{LMO,A} <: CachedLinearMinimizationOracle{LMO}
     last_vertex::Union{Nothing,A}
     inner::LMO
+    store_cache::Bool
 end
 
 # initializes with no cache by default
 SingleLastCachedLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle} =
-    SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo)
+    SingleLastCachedLMO{LMO,AbstractVector}(nothing, lmo, true)
+
+# gap is 0 if the vertex is exact, ∞ if it comes from the cache
+function compute_weak_separation_point(lmo::SingleLastCachedLMO, direction, max_value; kwargs...)
+    if lmo.last_vertex !== nothing && isfinite(max_value)
+        # the cached vertex provides sufficient decrease
+        if fast_dot(lmo.last_vertex, direction) ≤ max_value
+            T = promote_type(eltype(lmo.last_vertex), eltype(direction))
+            return lmo.last_vertex, T(Inf)
+        end
+    end
+    v = compute_extreme_point(lmo.inner, direction; kwargs...)
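+    # no usable cached vertex: fall back to the exact inner oracle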
+    if lmo.store_cache
+        lmo.last_vertex = v
+    end
+    T = promote_type(eltype(v), eltype(direction))
+    return v, zero(T)
+end
 
 function compute_extreme_point(
     lmo::SingleLastCachedLMO,
@@ -62,7 +103,7 @@ function compute_extreme_point(
             return lmo.last_vertex
         end
     end
-    v = compute_extreme_point(lmo.inner, direction, kwargs...)
+    v = compute_extreme_point(lmo.inner, direction; v=v, kwargs...)
     if store_cache
         lmo.last_vertex = v
     end
@@ -188,14 +229,17 @@ mutable struct VectorCacheLMO{LMO<:LinearMinimizationOracle,VT} <:
        CachedLinearMinimizationOracle{LMO}
     vertices::Vector{VT}
     inner::LMO
+    store_cache::Bool
+    greedy::Bool
+    weak_separation::Bool
 end
 
 function VectorCacheLMO{LMO,VT}(lmo::LMO) where {VT,LMO<:LinearMinimizationOracle}
-    return VectorCacheLMO{LMO,VT}(VT[], lmo)
+    return VectorCacheLMO{LMO,VT}(VT[], lmo, true, false, false)
 end
 
 function VectorCacheLMO(lmo::LMO) where {LMO<:LinearMinimizationOracle}
-    return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo)
+    return VectorCacheLMO{LMO,Vector{Float64}}(AbstractVector[], lmo, true, false, false)
 end
 
 function Base.empty!(lmo::VectorCacheLMO)
@@ -205,6 +249,65 @@ end
 
 Base.length(lmo::VectorCacheLMO) = length(lmo.vertices)
 
+function compute_weak_separation_point(lmo::VectorCacheLMO, direction, max_value; kwargs...)
+    if isempty(lmo.vertices)
+        v, gap = if lmo.weak_separation
+            compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...)
+        else
+            v = compute_extreme_point(lmo.inner, direction; kwargs...)
+            v, zero(eltype(v))
+        end
+        T = promote_type(eltype(v), eltype(direction))
+        if lmo.store_cache
+            push!(lmo.vertices, v)
+        end
+        return v, T(gap)
+    end
+    best_idx = -1
+    best_val = Inf
+    best_v = nothing
+    for idx in reverse(eachindex(lmo.vertices))
+        @inbounds v = lmo.vertices[idx]
+        new_val = fast_dot(v, direction)
+        if new_val ≤ max_value
+            T = promote_type(eltype(v), eltype(direction))
+            # stop and return
+            if lmo.greedy
+                return v, T(Inf)
+            end
+            # otherwise, compare to incumbent
+            if new_val < best_val
+                best_v = v
+                best_val = new_val
+                best_idx = idx
+            end
+        end
+    end
+    if best_idx > 0
+        T = promote_type(eltype(best_v), eltype(direction))
+        return best_v, T(Inf)
+    end
+    # no satisfactory vertex found, call the oracle
+    v, gap = if lmo.weak_separation
+        compute_weak_separation_point(lmo.inner, direction, max_value; kwargs...)
+    else
+        v = compute_extreme_point(lmo.inner, direction; kwargs...)
+        v, zero(eltype(v))
+    end
+    if lmo.store_cache
+        # note: we do not check for duplicates, hence the cache might end up with extra copies
+        # of vertices, up to as many as the number of dual steps performed.
+        # to reach this point, if v was already in the cache, it must not meet the threshold (otherwise we would have returned it)
+        # and it is the best vertex possible, hence we will perform a dual step on the outside.
+        #
+        # note: another possibility would be to check for duplicates before pushing, but then the same vertex might be recomputed a few times.
+        # as such, this is likely the better tradeoff, i.e., to not check the set for duplicates and potentially accept #dual_steps many duplicates.
+        push!(lmo.vertices, v)
+    end
+    T = promote_type(eltype(v), eltype(direction))
+    return v, T(gap)
+end
+
 function compute_extreme_point(
     lmo::VectorCacheLMO,
     direction;
@@ -228,7 +331,7 @@ function compute_extreme_point(
         @inbounds v = lmo.vertices[idx]
         new_val = fast_dot(v, direction)
         if new_val ≤ threshold
-            # stop, store and return
+            # stop and return
             if greedy
                 return v
             end
diff --git a/src/afw.jl b/src/afw.jl
index ee96824f5..e43c16e2c 100644
--- a/src/afw.jl
+++ b/src/afw.jl
@@ -33,6 +33,7 @@ function away_frank_wolfe(
     use_extra_vertex_storage=false,
     linesearch_workspace=nothing,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
     # add the first vertex to active set from initialization
     active_set = ActiveSet([(1.0, x0)])
@@ -64,6 +65,7 @@ function away_frank_wolfe(
         use_extra_vertex_storage=use_extra_vertex_storage,
         linesearch_workspace=linesearch_workspace,
         recompute_last_vertex=recompute_last_vertex,
+        weak_separation=weak_separation,
     )
 end
 
@@ -95,6 +97,7 @@ function away_frank_wolfe(
     use_extra_vertex_storage=false,
     linesearch_workspace=nothing,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
     # format string for output of the algorithm
     format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n"
@@ -153,7 +156,7 @@ function away_frank_wolfe(
         )
         grad_type = typeof(gradient)
         println(
-            "GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance MOMENTUM: $momentum AWAYSTEPS: $away_steps",
+            "GRADIENT TYPE: $grad_type LAZY: $lazy LAZY_TOLERANCE: $lazy_tolerance WEAK_SEPARATION: $weak_separation MOMENTUM: $momentum AWAYSTEPS: $away_steps",
         )
         println("Linear Minimization Oracle: $(typeof(lmo))")
     if (use_extra_vertex_storage || add_dropped_vertices) && extra_vertex_storage === nothing
@@ -223,10 +226,16 @@ function away_frank_wolfe(
                     extra_vertex_storage=extra_vertex_storage,
                     lazy_tolerance=lazy_tolerance,
                     memory_mode=memory_mode,
+                    weak_separation=weak_separation,
                 )
             else
                 d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                    afw_step(x, gradient, lmo, active_set, epsilon, d, memory_mode=memory_mode)
+                    afw_step(
+                        x, gradient, lmo, active_set, epsilon, d,
+                        memory_mode=memory_mode,
+                        weak_separation=weak_separation,
+                        lazy_tolerance=lazy_tolerance,
+                    )
             end
         else
             d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
@@ -248,7 +257,7 @@ function away_frank_wolfe(
             memory_mode,
         )
 
-        gamma = min(gamma_max, gamma)
+        gamma = min(gamma_max, gamma)
+        # cleanup and renormalize every x iterations. Only for the fw steps.
         renorm = mod(t, renorm_interval) == 0
         if away_step_taken
@@ -368,9 +377,9 @@ function away_frank_wolfe(
     return x, v, primal, dual_gap, traj_data, active_set
 end
 
-function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis())
+function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=false)
     _, v, v_loc, _, a_lambda, a, a_loc, _, _ = active_set_argminmax(active_set, gradient)
-    #Do lazy FW step
+    # do lazy FW step
    grad_dot_lazy_fw_vertex = fast_dot(v, gradient)
    grad_dot_x = fast_dot(x, gradient)
    grad_dot_a = fast_dot(a, gradient)
@@ -385,7 +394,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
         fw_step_taken = true
         index = v_loc
     else
-        #Do away step, as it promises enough progress.
+        # do away step, as it promises enough progress.
         if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex &&
            grad_dot_a - grad_dot_x >= phi / lazy_tolerance
             tt = away
@@ -395,7 +404,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
             away_step_taken = true
             fw_step_taken = false
             index = a_loc
-        #Resort to calling the LMO
+        # resort to calling the LMO
         else
             # optionally: try vertex storage
             if use_extra_vertex_storage
@@ -406,17 +415,25 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
                     @debug("Found acceptable lazy vertex in storage")
                     v = new_forward_vertex
                     tt = lazylazy
+                end
+            else
+                found_better_vertex = false
+            end
+            if !found_better_vertex
+                # compute new vertex with normal or weak oracle
+                if weak_separation
+                    lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance
+                    (v, gap) = compute_weak_separation_point(lmo, gradient, lazy_threshold)
+                    tt = gap == 0.0 ? regular : weaksep
                 else
                     v = compute_extreme_point(lmo, gradient)
+                    gap = zero(eltype(v))
                     tt = regular
                 end
-            else
-                v = compute_extreme_point(lmo, gradient)
-                tt = regular
             end
-            # Real dual gap promises enough progress.
             grad_dot_fw_vertex = fast_dot(v, gradient)
             dual_gap = grad_dot_x - grad_dot_fw_vertex
+            # Real dual gap promises enough progress.
             if dual_gap >= phi / lazy_tolerance
                 gamma_max = one(a_lambda)
                 d = muladd_memory_mode(memory_mode, d, x, v)
@@ -424,8 +441,8 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
                 away_step_taken = false
                 fw_step_taken = true
                 index = -1
-            #Lower our expectation for progress.
-            else
+            else # lower our expectation for progress.
+                @assert tt != weaksep
                 tt = dualstep
                 phi = min(dual_gap, phi / 2.0)
                 gamma_max = zero(a_lambda)
@@ -439,14 +456,25 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_
     return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt
 end
 
-function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis())
+function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryEmphasis=InplaceEmphasis(), weak_separation::Bool=false, lazy_tolerance=2.0)
     _, _, _, _, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient)
-    v = compute_extreme_point(lmo, gradient)
     grad_dot_x = fast_dot(x, gradient)
     away_gap = fast_dot(a, gradient) - grad_dot_x
+    (v, gap) = if weak_separation
+        # Condition for taking a FW step:
+        # ⟨∇f, x-v⟩ ≥ gₐ <=>
+        # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - gₐ
+        # We ask for a bit more progress on the FW step
+        # to promote away steps when we can (and therefore sparsity):
+        # ⟨∇f, v⟩ ≤ ⟨∇f, x⟩ - K gₐ
+        lazy_threshold = grad_dot_x - lazy_tolerance * away_gap
+        compute_weak_separation_point(lmo, gradient, lazy_threshold)
+    else
+        (compute_extreme_point(lmo, gradient), zero(away_gap))
+    end
     dual_gap = grad_dot_x - fast_dot(v, gradient)
-    if dual_gap >= away_gap && dual_gap >= epsilon
-        tt = regular
+    if dual_gap > away_gap && dual_gap >= epsilon
+        tt = gap == 0.0 ? regular : weaksep
         gamma_max = one(a_lambda)
         d = muladd_memory_mode(memory_mode, d, x, v)
         vertex = v
@@ -469,7 +497,7 @@ function afw_step(x, gradient, lmo, active_set, epsilon, d; memory_mode::MemoryE
         fw_step_taken = false
         index = a_loc
     end
-    return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt
+    return d, vertex, index, gamma_max, dual_gap + gap, away_step_taken, fw_step_taken, tt
 end
 
 function fw_step(x, gradient, lmo, d; memory_mode::MemoryEmphasis = InplaceEmphasis())
diff --git a/src/defs.jl b/src/defs.jl
index db3c8c0f1..c251b8181 100644
--- a/src/defs.jl
+++ b/src/defs.jl
@@ -19,6 +19,7 @@ struct OutplaceEmphasis <: MemoryEmphasis end
     away = 6
     pairwise = 7
     drop = 8
+    weaksep = 9
     simplex_descent = 101
     gap_step = 102
     last = 1000
@@ -34,6 +35,7 @@ const st = (
     away="A",
     pairwise="P",
     drop="D",
+    weaksep="W",
     simplex_descent="SD",
     gap_step="GS",
     last="Last",
diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl
index daddb6145..814c44473 100644
--- a/src/fw_algorithms.jl
+++ b/src/fw_algorithms.jl
@@ -273,6 +273,7 @@ function lazified_conditional_gradient(
     VType=typeof(x0),
     timeout=Inf,
     linesearch_workspace=nothing,
+    weak_separation=false,
 )
 
     # format string for output of the algorithm
@@ -293,14 +294,16 @@ function lazified_conditional_gradient(
     end
 
     lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base)
+    lmo.greedy = greedy_lazy
     if isfinite(cache_size)
         Base.sizehint!(lmo.vertices, cache_size)
     end
+    lmo.weak_separation = weak_separation
 
     t = 0
     dual_gap = Inf
     primal = Inf
-    v = []
+    v = x0
     x = x0
     phi = Inf
     tt = regular
@@ -350,10 +353,10 @@ function lazified_conditional_gradient(
         linesearch_workspace = build_linesearch_workspace(line_search, x, gradient)
     end
 
-    while t <= max_iteration && dual_gap >= max(epsilon, eps(float(eltype(x))))
+    while t <= max_iteration && phi >= max(epsilon, eps(float(eltype(x))))
 
         #####################
-        # managing time and Ctrl-C
+        # managing time
         #####################
         time_at_loop = time_ns()
         if t == 0
@@ -382,12 +385,17 @@ function lazified_conditional_gradient(
             primal = f(x)
         end
 
-        v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy)
+        v, weak_gap = compute_weak_separation_point(lmo, gradient, threshold)
         tt = lazy
         if fast_dot(v, gradient) > threshold
             tt = dualstep
             dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
+            @assert weak_gap == 0
             phi = min(dual_gap, phi / 2)
+        else
+            tt = weak_gap > 0 ? weaksep : regular
+            dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
+            phi = min(phi, dual_gap + weak_gap)
         end
 
         d = muladd_memory_mode(memory_mode, d, x, v)
diff --git a/src/moi_oracle.jl b/src/moi_oracle.jl
index 875b8001d..18014a225 100644
--- a/src/moi_oracle.jl
+++ b/src/moi_oracle.jl
@@ -128,7 +128,6 @@ function _optimize_and_return(lmo, variables)
     term_st = MOI.get(lmo.o, MOI.TerminationStatus())
     if term_st ∉ (MOI.OPTIMAL, MOI.ALMOST_OPTIMAL, MOI.SLOW_PROGRESS)
         @error "Unexpected termination: $term_st"
-        return MOI.get.(lmo.o, MOI.VariablePrimal(), variables)
     end
     return MOI.get.(lmo.o, MOI.VariablePrimal(), variables)
 end
diff --git a/src/pairwise.jl b/src/pairwise.jl
index 1f401405b..16bd2fb80 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -30,6 +30,7 @@ function blended_pairwise_conditional_gradient(
     add_dropped_vertices=false,
     use_extra_vertex_storage=false,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
     # add the first vertex to active set from initialization
     active_set = ActiveSet([(1.0, x0)])
@@ -58,6 +59,7 @@ function blended_pairwise_conditional_gradient(
         add_dropped_vertices=add_dropped_vertices,
         use_extra_vertex_storage=use_extra_vertex_storage,
         recompute_last_vertex=recompute_last_vertex,
+        weak_separation=weak_separation,
     )
 end
 
@@ -90,6 +92,7 @@ function blended_pairwise_conditional_gradient(
     add_dropped_vertices=false,
     use_extra_vertex_storage=false,
     recompute_last_vertex=true,
+    weak_separation=false,
 )
 
     # format string for output of the algorithm
@@ -137,7 +140,7 @@ function blended_pairwise_conditional_gradient(
             "MEMORY_MODE: $memory_mode STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $NumType",
         )
         grad_type = typeof(gradient)
-        println("GRADIENTTYPE: $grad_type LAZY: $lazy lazy_tolerance: $lazy_tolerance")
+        println("GRADIENT TYPE: $grad_type LAZY: $lazy LAZY_TOLERANCE: $lazy_tolerance WEAK_SEPARATION: $weak_separation")
         println("Linear Minimization Oracle: $(typeof(lmo))")
         if use_extra_vertex_storage && !lazy
             @info("vertex storage only used in lazy mode")
@@ -201,14 +204,24 @@ function blended_pairwise_conditional_gradient(
         local_gap = dot_away_vertex - dot_forward_vertex
         if !lazy
             if t > 1
-                v = compute_extreme_point(lmo, gradient)
-                dual_gap = fast_dot(gradient, x) - fast_dot(gradient, v)
-                phi = dual_gap
+                dot_x = fast_dot(gradient, x)
+                (v, weak_gap) = if weak_separation
+                    # we need a separation point v with
+                    # ⟨∇f(x), x-v⟩ ≥ local_gap * lazy_tolerance, i.e.
+                    # ⟨∇f(x), v⟩ ≤ ⟨∇f(x), x⟩ - local_gap * lazy_tolerance
+                    threshold = dot_x - local_gap * lazy_tolerance
+                    compute_weak_separation_point(lmo, gradient, threshold)
+                else
+                    (compute_extreme_point(lmo, gradient), zero(phi))
+                end
+                dual_gap = dot_x - fast_dot(gradient, v)
+                phi = dual_gap + weak_gap
             end
         end
         # minor modification from original paper for improved sparsity
         # (proof follows with minor modification when estimating the step)
-        if local_gap ≥ phi / lazy_tolerance
+        if local_gap ≥ phi / lazy_tolerance # pairwise step
             d = muladd_memory_mode(memory_mode, d, a, v_local)
             vertex_taken = v_local
             gamma_max = a_lambda
@@ -274,15 +287,28 @@ function blended_pairwise_conditional_gradient(
                     v = new_forward_vertex
                     tt = lazylazy
                 else
-                    v = compute_extreme_point(lmo, gradient)
-                    tt = regular
+                    (v, gap) = if weak_separation
+                        compute_weak_separation_point(lmo, gradient, lazy_threshold)
+                    else
+                        (compute_extreme_point(lmo, gradient), 0.0)
+                    end
+                    tt = gap == 0.0 ? regular : weaksep
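+                    # gap == 0 certifies an exact vertex; a positive (possibly infinite) gap
+                    # marks a weak-separation vertex that only meets the lazy threshold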
                 end
             else
                 # for t == 1, v is already computed before first iteration
-                if t > 1
-                    v = compute_extreme_point(lmo, gradient)
+                if t == 1
+                    gap = 0.0
+                else
+                    (v, gap) = if weak_separation
+                        lazy_threshold = fast_dot(gradient, x) - phi / lazy_tolerance
+                        compute_weak_separation_point(lmo, gradient, lazy_threshold)
+                    else
+                        (compute_extreme_point(lmo, gradient), 0.0)
+                    end
                 end
-                tt = regular
+                tt = gap == 0.0 ? regular : weaksep
             end
         end
         vertex_taken = v
@@ -305,7 +331,7 @@ function blended_pairwise_conditional_gradient(
         # - for lazy: we also accept slightly weaker vertices, those satisfying phi / lazy_tolerance
         # this should simplify the criterion.
        # DO NOT CHANGE without good reason and talk to Sebastian first for the logic behind this.
-        if !lazy || dual_gap ≥ phi / lazy_tolerance
+        if !lazy || dual_gap ≥ phi / lazy_tolerance
             d = muladd_memory_mode(memory_mode, d, x, v)
 
             gamma = perform_line_search(
@@ -360,9 +386,10 @@ function blended_pairwise_conditional_gradient(
         else # dual step
             # set to computed dual_gap for consistency between the lazy and non-lazy run.
             # that is ok as we scale with the K = 2.0 default anyways
-            # we only update the dual gap if the step was regular (not lazy from discarded set)
+            # we only update the dual gap if the step was regular or weaksep (not lazy from the discarded set)
             if tt != lazylazy
-                phi = dual_gap
+                @assert dual_gap + gap < phi
+                phi = dual_gap + gap
                 @debug begin
                     @assert tt == regular
                     v2 = compute_extreme_point(lmo, gradient)
diff --git a/src/tracking.jl b/src/tracking.jl
index 949806fbf..205043cc1 100644
--- a/src/tracking.jl
+++ b/src/tracking.jl
@@ -55,7 +55,12 @@ end
 
 function compute_extreme_point(lmo::TrackingLMO, x; kwargs...)
     lmo.counter += 1
-    return compute_extreme_point(lmo.lmo, x)
+    return compute_extreme_point(lmo.lmo, x; kwargs...)
+end
+
+function compute_weak_separation_point(lmo::TrackingLMO, direction, max_value; kwargs...)
+    lmo.counter += 1
+    return compute_weak_separation_point(lmo.lmo, direction, max_value; kwargs...)
 end
 
 is_tracking_lmo(lmo) = false
diff --git a/test/runtests.jl b/test/runtests.jl
index 1d0c9c4c9..fa9c9b29e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -785,3 +785,10 @@ include("generic-arrays.jl")
         end
     end
 end
+
+module WeakSeparation
+using Test
+@testset "weak separation oracles" begin
+    include("weak_separation.jl")
+end
+end
diff --git a/test/weak_separation.jl b/test/weak_separation.jl
new file mode 100644
index 000000000..b7c214f9d
--- /dev/null
+++ b/test/weak_separation.jl
@@ -0,0 +1,117 @@
+using FrankWolfe
+using LinearAlgebra
+using Test
+
+"""
+A test LMO representing the feasible set [0,1]^n.
+`nops` tracks the number of operations performed.
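+Both oracles charge `nops` with the number of entries of `direction` they inspect.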
+""" +mutable struct Hypercube <: FrankWolfe.LinearMinimizationOracle + nops::Int +end + +Hypercube() = Hypercube(0) + +function FrankWolfe.compute_extreme_point(lmo::Hypercube, direction::AbstractArray{T}) where {T} + v = similar(direction, T) + v .= 0 + for idx in eachindex(v) + if direction[idx] < 0 + v[idx] = 1 + end + end + lmo.nops += length(v) + return v +end + +# a very naive implementation of a weak separation oracle without a gap (a heuristic) +# we stop iterating through the indices whenever we obtain sufficient progress +function FrankWolfe.compute_weak_separation_point(lmo::Hypercube, direction::AbstractArray{T}, max_value) where {T} + v = similar(direction, T) + v .= 0 + res = zero(T) + gap = zero(T) + nops = 0 + for idx in eachindex(v) + nops += 1 + if direction[idx] < 0 + v[idx] = 1 + res += direction[idx] + end + if res <= max_value && idx != lastindex(v) + gap = T(Inf) + break + end + end + lmo.nops += nops + return v, gap +end + +@testset "Basic behaviour" begin + lmo = Hypercube() + direction = rand(4,2) + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test size(v) == size(direction) + @test norm(v) == 0 + (w, _) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test norm(w) == 0 + direction .*= -1 + direction[end] -= 1 + v = FrankWolfe.compute_extreme_point(lmo, direction) + @test v == ones(size(direction)) + (w, gap0) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1) + @test dot(w, direction) <= -1 + @test gap0 == Inf + (w, gap1) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1.5) + @test dot(w, direction) <= -1.5 + @test gap1 == Inf + # asking for too much progress results in exact LMO call + (w, gap3) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1000) + @test gap3 == 0.0 + @test w == v +end + +@testset "AFW and BPCG with weak separation" begin + n = 1000 + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function grad!(storage, x) + storage .= x + storage .-= ref_point + storage ./= n + end + x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n)) + for lazy in (false, true) + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.away_frank_wolfe(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.away_frank_wolfe(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter < tracking_weak.counter + + tracking_lmo = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_exact, active_set_exact = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=false, weak_separation=false, lazy=lazy) + tracking_weak = FrankWolfe.TrackingLMO(Hypercube()) + x, v, primal, dual_gap, trajectory_weak, active_set_weak = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, tracking_weak, x0, verbose=false, weak_separation=true, lazy=lazy) + @test tracking_lmo.counter < tracking_weak.counter + end +end + +@testset "Lazy-FW with weak separation" begin + n = 1000 + # reference point to get an optimum with many vertices + ref_point = [0.4 + idx / n * mod(idx, 2) for idx in 1:n] + f(x) = 1/2 / n * sum((x[i] - ref_point[i])^2 for i in eachindex(x)) + function 
+        storage .= x
+        storage .-= ref_point
+        storage ./= n
+    end
+    x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n))
+    tracking_lmo = FrankWolfe.TrackingLMO(Hypercube())
+    x, v, primal, dual_gap, trajectory_exact = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_lmo, x0, verbose=true, weak_separation=false)
+    x0 = FrankWolfe.compute_extreme_point(Hypercube(), -ones(n))
+    tracking_weak = FrankWolfe.TrackingLMO(Hypercube())
+    x, v, primal, dual_gap, trajectory_weak = FrankWolfe.lazified_conditional_gradient(f, grad!, tracking_weak, x0, verbose=true, weak_separation=true)
+    @test tracking_lmo.counter < tracking_weak.counter
+end
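+
+# A minimal usage sketch of the weak oracle contract, reusing the `Hypercube` LMO
+# defined above (the thresholds below are illustrative values, not part of the
+# package API): `gap == Inf` flags a heuristic vertex that merely satisfies the
+# requested `max_value`, while `gap == 0` certifies an exact vertex.
+@testset "Weak oracle contract on the hypercube" begin
+    lmo = Hypercube()
+    direction = -ones(5)
+    # modest target: the heuristic may stop scanning early
+    (v, gap) = FrankWolfe.compute_weak_separation_point(lmo, direction, -2.0)
+    @test dot(v, direction) <= -2.0 || gap == 0.0
+    # unattainable target: forces a full scan, hence an exact vertex with zero gap
+    (w, gap_exact) = FrankWolfe.compute_weak_separation_point(lmo, direction, -1000.0)
+    @test gap_exact == 0.0
+    @test w == FrankWolfe.compute_extreme_point(Hypercube(), direction)
+end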