diff --git a/src/FrankWolfe.jl b/src/FrankWolfe.jl
index 0f5eee244..4c403870f 100644
--- a/src/FrankWolfe.jl
+++ b/src/FrankWolfe.jl
@@ -41,7 +41,7 @@ include("fw_algorithms.jl")
 
 # collecting most common data types etc and precompile
 # min version req set to 1.5 to prevent stalling of julia 1
-@static if VERSION >= v"1.5"
+@static if VERSION >= v"1.5"
+    println("Precompiling common signatures. This might take a moment...")
     include("precompile.jl")
 end
diff --git a/src/afw.jl b/src/afw.jl
index 2c4a84c58..c90f75f16 100644
--- a/src/afw.jl
+++ b/src/afw.jl
@@ -32,6 +32,7 @@ function away_frank_wolfe(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -89,7 +90,7 @@ function away_frank_wolfe(
 
     x = compute_active_set_iterate(active_set)
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     phi_value = max(0, fast_dot(x, gradient) - fast_dot(v, gradient))
     gamma = 1.0
 
@@ -128,14 +129,14 @@ function away_frank_wolfe(
         if away_steps
             if lazy
                 d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                    lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K)
+                    lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K, kwargs...)
             else
                 d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                    afw_step(x, gradient, lmo, active_set)
+                    afw_step(x, gradient, lmo, active_set; kwargs...)
             end
         else
             d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
-                fw_step(x, gradient, lmo)
+                fw_step(x, gradient, lmo; kwargs...)
         end
 
         if fw_step_taken || away_step_taken
@@ -215,7 +216,7 @@ function away_frank_wolfe(
     if verbose
         x = compute_active_set_iterate(active_set)
         grad!(gradient, x)
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
         primal = f(x)
         dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
         tt = last
@@ -237,7 +238,7 @@ function away_frank_wolfe(
     active_set_cleanup!(active_set)
     x = compute_active_set_iterate(active_set)
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
     if verbose
@@ -260,9 +261,9 @@ function away_frank_wolfe(
     return x, v, primal, dual_gap, traj_data, active_set
 end
 
-function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
+function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0, kwargs...)
     v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient)
-    #Do lazy FW step
+    # Do lazy FW step
     grad_dot_lazy_fw_vertex = fast_dot(v, gradient)
     grad_dot_x = fast_dot(x, gradient)
     grad_dot_a = fast_dot(a, gradient)
@@ -276,7 +277,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
         fw_step_taken = true
         index = v_loc
     else
-        #Do away step, as it promises enough progress.
+        # Do away step, as it promises enough progress.
         if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex &&
            grad_dot_a - grad_dot_x >= phi / K
             tt = away
@@ -286,9 +287,9 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
             away_step_taken = true
             fw_step_taken = false
             index = a_loc
-        #Resort to calling the LMO
+        # Resort to calling the LMO
         else
-            v = compute_extreme_point(lmo, gradient)
+            v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
             # Real dual gap promises enough progress.
             grad_dot_fw_vertex = fast_dot(v, gradient)
             dual_gap = grad_dot_x - grad_dot_fw_vertex
@@ -300,7 +301,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
                 away_step_taken = false
                 fw_step_taken = true
                 index = nothing
-            #Lower our expectation for progress.
+            # Lower our expectation for progress.
             else
                 tt = dualstep
                 phi = min(dual_gap, phi / 2.0)
@@ -316,11 +317,11 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
     return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt
 end
 
-function afw_step(x, gradient, lmo, active_set)
+function afw_step(x, gradient, lmo, active_set; kwargs...)
     local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc =
         active_set_argminmax(active_set, gradient)
     away_gap = fast_dot(a, gradient) - fast_dot(x, gradient)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     grad_dot_x = fast_dot(x, gradient)
     away_gap = fast_dot(a, gradient) - grad_dot_x
     dual_gap = grad_dot_x - fast_dot(v, gradient)
@@ -344,8 +345,8 @@ function afw_step(x, gradient, lmo, active_set)
     return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt
 end
 
-function fw_step(x, gradient, lmo)
-    vertex = compute_extreme_point(lmo, gradient)
+function fw_step(x, gradient, lmo; kwargs...)
+    vertex = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     return (
         x - vertex,
         vertex,
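Aside, not part of the patch: with the algorithms now forwarding `x=x; kwargs...` to every `compute_extreme_point` call, a custom LMO only needs to accept and ignore keywords it does not use. A minimal hypothetical sketch (the name `HyperCubeLMO` is made up for illustration; the package's own `LpNormLMO{Inf}` already plays this role):

import FrankWolfe

struct HyperCubeLMO <: FrankWolfe.LinearMinimizationOracle
    radius::Float64
end

function FrankWolfe.compute_extreme_point(lmo::HyperCubeLMO, direction; kwargs...)
    # vertex of the box [-radius, radius]^n minimizing <direction, v>;
    # `x` and any other forwarded keywords are swallowed by `kwargs...`
    return [d > 0 ? -lmo.radius : lmo.radius for d in direction]
end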
diff --git a/src/blended_cg.jl b/src/blended_cg.jl
index cd51de342..cf6547fc8 100644
--- a/src/blended_cg.jl
+++ b/src/blended_cg.jl
@@ -51,7 +51,7 @@ function blended_conditional_gradient(
     primal = f(x)
     grad!(gradient, x)
     # initial gap estimate computation
-    vmax = compute_extreme_point(lmo, gradient)
+    vmax = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
     phi = fast_dot(gradient, x0 - vmax) / 2
     dual_gap = phi
     traj_data = []
@@ -243,7 +243,7 @@ function blended_conditional_gradient(
     if verbose
         x = compute_active_set_iterate(active_set)
         grad!(gradient, x)
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
         primal = f(x)
         dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
         tot_time = (time_ns() - time_start) / 1e9
@@ -267,7 +267,7 @@ function blended_conditional_gradient(
     active_set_renormalize!(active_set)
     x = compute_active_set_iterate(active_set)
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
     primal = f(x)
     #dual_gap = 2phi
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
@@ -1025,7 +1025,7 @@ function lp_separation_oracle(
         end
     end
     # otherwise, call the LMO
-    y = compute_extreme_point(lmo, direction; kwargs...)
+    y = compute_extreme_point(lmo, direction; x=x, kwargs...)
     # don't return nothing but y, fast_dot(direction, y) / use y for step outside / and update phi as in LCG (lines 402 - 406)
     return (y, fast_dot(direction, y))
 end
diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl
index 5799356e7..d4740c8ec 100644
--- a/src/fw_algorithms.jl
+++ b/src/fw_algorithms.jl
@@ -31,6 +31,7 @@ function frank_wolfe(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -131,7 +132,8 @@ function frank_wolfe(
         end
         first_iter = false
 
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
+        #@show typeof(v)
         # go easy on the memory - only compute if really needed
         if (
             (mod(t, print_iter) == 0 && verbose) ||
@@ -159,6 +161,9 @@ function frank_wolfe(
                 step_lim,
                 one(eltype(x)),
             )
+        #@show typeof(x)
+        #@show typeof(d)
+        #@show typeof(x-gamma*d)
         if callback !== nothing
             state = (
                 t=t,
@@ -201,7 +206,9 @@ function frank_wolfe(
     # hence the final computation.
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
+    #@show v
+    #@show typeof(v)
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
     if verbose
@@ -256,6 +263,7 @@ function lazified_conditional_gradient(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -351,7 +359,14 @@ function lazified_conditional_gradient(
             primal = f(x)
         end
 
-        v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy)
+        v = compute_extreme_point(
+            lmo,
+            gradient,
+            threshold=threshold,
+            greedy=greedy_lazy,
+            x=x;
+            kwargs...,
+        )
         tt = lazy
         if fast_dot(v, gradient) > threshold
             tt = dualstep
@@ -418,7 +433,7 @@ function lazified_conditional_gradient(
     # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting
     # hence the final computation.
     grad!(gradient, x)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
 
@@ -470,6 +485,7 @@ function stochastic_frank_wolfe(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    kwargs...,
 )
 
     # format string for output of the algorithm
@@ -568,7 +584,7 @@ function stochastic_frank_wolfe(
         end
         first_iter = false
 
-        v = compute_extreme_point(lmo, gradient)
+        v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
         # go easy on the memory - only compute if really needed
         if (mod(t, print_iter) == 0 && verbose) ||
@@ -632,7 +648,7 @@ function stochastic_frank_wolfe(
     # last computation done with full evaluation for exact gradient
     (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true)
-    v = compute_extreme_point(lmo, gradient)
+    v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
     # @show (gradient, primal)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
     if verbose
diff --git a/src/oracles.jl b/src/oracles.jl
index bdcb387ca..58e59bc59 100644
--- a/src/oracles.jl
+++ b/src/oracles.jl
@@ -61,7 +61,7 @@ function compute_extreme_point(
             return v
         end
     end
-    v = compute_extreme_point(lmo.inner, direction, kwargs...)
+    v = compute_extreme_point(lmo.inner, direction; kwargs...)
     if store_cache
         lmo.last_vertex = v
     end
@@ -158,14 +158,14 @@ function compute_extreme_point(
                 end
             end
         end
-        if best_idx > 0  # && fast_dot(best_v, direction) ≤ threshold
+        if best_idx > 0 # && fast_dot(best_v, direction) ≤ threshold
            # println("cache sol")
            return best_v
        end
    end
    # no interesting point found, computing new
    # println("LP sol")
-    v = compute_extreme_point(lmo.inner, direction, kwargs...)
+    v = compute_extreme_point(lmo.inner, direction; kwargs...)
    if store_cache
        tup = Base.setindex(lmo.vertices, v, lmo.oldest_idx)
        lmo.vertices = tup
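Aside, not part of the patch: the two hunks above replace the comma before the splat with a semicolon because the two mean different things in a Julia call. A self-contained toy illustration (`f` and `opts` are placeholders):

f(a; kwargs...) = (a, values(kwargs))

opts = (threshold=0.5,)
f(1; opts...)  # keyword splat: returns (1, (threshold = 0.5,))
f(1, opts...)  # positional splat: equivalent to f(1, 0.5), throws a MethodError

With the comma, a cached-LMO keyword such as `threshold` would have been passed to the inner oracle as an extra positional argument instead of as a keyword.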
@@ -241,10 +241,10 @@ function compute_extreme_point(
     if best_idx < 0
         v = compute_extreme_point(lmo.inner, direction)
         if store_cache
-            # note: we do not check for duplicates. hence you might end up with more vertices,
+            # note: we do not check for duplicates. hence you might end up with more vertices,
             # in fact up to number of dual steps many, that might be already in the cache
-            # in order to reach this point, if v was already in the cache is must not meet the threshold (otherwise we would have returned it)
-            # and it is the best possible, hence we will perform a dual step on the outside.
+            # in order to reach this point, if v was already in the cache it must not meet the threshold (otherwise we would have returned it)
+            # and it is the best possible, hence we will perform a dual step on the outside.
             #
             # note: another possibility could be to test against that in the if statement but then you might end you recalculating the same vertex a few times.
             # as such this might be a better tradeoff, i.e., to not check the set for duplicates and potentially accept #dualSteps many duplicates.
@@ -303,3 +303,116 @@ function compute_extreme_point(
     end
     return storage
 end
+
+"""
+    ChasingGradientLMO{LMO,T}
+
+Oracle boosting FW with gradient pursuit: the LMO constructs its direction as a
+combination of multiple extreme vertices, `d_t = ∑_k λ_k (v_k - x_t)`, found in
+successive pursuit rounds.
+Rounds terminate once the improvement in alignment with the negative gradient
+falls below `improv_tol`, or after `max_rounds_number` rounds.
+
+The LMO keeps internal containers `d`, `u`, `residual` and `m_residual` to reduce allocations.
+See the [paper](https://arxiv.org/abs/2003.06369).
+All keyword arguments to `compute_extreme_point` are passed to the inner LMO.
+"""
+mutable struct ChasingGradientLMO{LMO<:LinearMinimizationOracle,T,G,G2} <: LinearMinimizationOracle
+    inner::LMO
+    max_rounds_number::Int
+    improv_tol::T
+    d::G
+    u::G
+    residual::G2
+    m_residual::G2
+end
+
+function ChasingGradientLMO(lmo, max_rounds_number, improv_tol, d, residual)
+    return ChasingGradientLMO(
+        lmo,
+        max_rounds_number,
+        improv_tol,
+        d,
+        copy(d),
+        residual,
+        copy(residual),
+    )
+end
+
+function _zero!(d)
+    @. d = 0
+    return nothing
+end
+
+function _zero!(d::SparseArrays.AbstractSparseArray)
+    @. d = 0
+    SparseArrays.dropzeros!(d)
+    return nothing
+end
+
+function _inplace_plus(d, v)
+    @. d += v
+end
+
+function _inplace_plus(d, v::ScaledHotVector)
+    return d[v.val_idx] += v.active_val
+end
+
+function compute_extreme_point(lmo::ChasingGradientLMO, direction; x, kwargs...)
+    _zero!(lmo.d)
+    v_ret = 0 * lmo.d
+    Λ = zero(eltype(direction))
+    norm_direction = norm(direction)
+    if norm_direction <= eps(eltype(direction))
+        # degenerate (numerically zero) direction: fall back to a single inner LMO call
+        v_ret .+= compute_extreme_point(lmo.inner, direction, x=x; kwargs...)
+        return v_ret
+    end
+    use_v = true
+    # alignment of d with the descent direction -direction, in [-1, 1]
+    function align(d)
+        if norm(d) <= eps(eltype(d))
+            return -one(eltype(d))
+        else
+            return -fast_dot(direction, d) / (norm_direction * norm(d))
+        end
+    end
+    @. lmo.residual = -direction
+    @. lmo.m_residual = direction
+    for round in 1:lmo.max_rounds_number
+        if norm(lmo.residual) <= eps(eltype(lmo.residual))
+            break
+        end
+        v = compute_extreme_point(lmo.inner, lmo.m_residual; kwargs...)
+        @. lmo.u = -x
+        _inplace_plus(lmo.u, v)
+        d_norm = norm(lmo.d)
+        # either add the new atom v - x or take a retraction step along -d
+        if d_norm > 0 && fast_dot(lmo.residual, lmo.d / d_norm) < -fast_dot(lmo.residual, lmo.u)
+            @. lmo.u = -lmo.d / d_norm
+            use_v = false
+        end
+        λ = fast_dot(lmo.residual, lmo.u) / (norm(lmo.u)^2)
+        d_updated = lmo.u
+        @. d_updated = lmo.d + λ * lmo.u # 2 allocs
+        if align(d_updated) - align(lmo.d) >= lmo.improv_tol
+            @. lmo.d = d_updated
+            # @. lmo.residual = -direction - lmo.d
+            @. lmo.residual -= λ * lmo.u
+            @. lmo.m_residual = -lmo.residual
+            if use_v
+                Λ += λ
+            else
+                Λ = Λ * (1 - λ / d_norm)
+            end
+            use_v = true
+        else
+            #@show round
+            break
+        end
+    end
+    if Λ <= eps(eltype(Λ))
+        # no pursuit round was accepted: return the plain FW vertex of the inner LMO
+        v_ret .+= compute_extreme_point(lmo.inner, direction, x=x; kwargs...)
+        return v_ret
+    end
+    # rescale the accumulated direction d ≈ ∑_k λ_k (v_k - x) back to a point x + d / Λ
+    @. v_ret += x
+    @. v_ret += lmo.d / Λ # 2 allocations
+    return v_ret
+end
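Aside, not part of the patch: a sketch of how the new oracle is wired into `frank_wolfe`, mirroring the test added in `test/lmo.jl` below; the problem data (`n`, `xp`, `f`, `grad!`) are illustrative only. The fourth and fifth constructor arguments are the reusable workspaces `d` (here sparse) and `residual` (dense), and must match the dimension of the iterates.

using LinearAlgebra, SparseArrays
import FrankWolfe

n = 1_000
xp = rand(n); xp ./= sum(xp)
f(x) = norm(x - xp)^2
grad!(storage, x) = (storage .= 2 .* (x .- xp); nothing)

lmo_inner = FrankWolfe.ProbabilitySimplexOracle(1.0)
# at most 10 pursuit rounds per call, stop once the alignment gain drops below 1e-3
lmo = FrankWolfe.ChasingGradientLMO(lmo_inner, 10, 1e-3, spzeros(n), zeros(n))

x0 = sparse(FrankWolfe.compute_extreme_point(lmo_inner, zeros(n)))
x, v, primal, dual_gap, traj = FrankWolfe.frank_wolfe(
    f,
    grad!,
    lmo,
    x0,
    max_iteration=500,
    gradient=zeros(n),
    line_search=FrankWolfe.Adaptive(),
)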
diff --git a/src/simplex_oracles.jl b/src/simplex_oracles.jl
index 34d5407cd..e1e96e411 100644
--- a/src/simplex_oracles.jl
+++ b/src/simplex_oracles.jl
@@ -21,7 +21,7 @@ LMO for scaled unit simplex:
 Returns either vector of zeros or vector with one active value equal to RHS
 if there exists an improving direction.
 """
-function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction) where {T}
+function compute_extreme_point(lmo::UnitSimplexOracle{T}, direction; kwargs...) where {T}
     idx = argmin(direction)
     if direction[idx] < 0
         return ScaledHotVector(lmo.right_side, idx, length(direction))
diff --git a/test/lmo.jl b/test/lmo.jl
index 4cbd92a14..5492bb050 100644
--- a/test/lmo.jl
+++ b/test/lmo.jl
@@ -1,7 +1,7 @@
 using Test
 using FrankWolfe
 using LinearAlgebra
-import SparseArrays
+using SparseArrays
 
 import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO
 
@@ -519,7 +519,7 @@ end
 end
 
 @testset "Product LMO" begin
-    #
+    #
     lmo = FrankWolfe.ProductLMO(FrankWolfe.LpNormLMO{Inf}(3.0), FrankWolfe.LpNormLMO{1}(2.0))
     dinf = randn(10)
     d1 = randn(5)
@@ -532,3 +532,53 @@ end
     vvec = FrankWolfe.compute_extreme_point(lmo, [dinf; d1]; direction_indices=(1:10, 11:15))
     @test vvec ≈ [vinf; v1]
 end
+
+
+@testset "Chasing Gradient LMO" begin
+    @testset "Consistent results with 1 round" begin
+        max_rounds = 1
+        improv_tol = 1e-3
+        n = Int(1e5)
+        xpi = rand(n)
+        total = sum(xpi)
+        xp = xpi ./ total
+        f(x) = norm(x - xp)^2
+        function grad!(storage, x)
+            @. storage = 2 * (x - xp)
+            return nothing
+        end
+        d = spzeros(n)
+        residual = zeros(n)
+        for lmo_inner in [
+            FrankWolfe.LpNormLMO{1}(1.0),
+            FrankWolfe.LpNormLMO{1}(1.0),
+            FrankWolfe.ProbabilitySimplexOracle(1.0),
+        ]
+            x00 = FrankWolfe.compute_extreme_point(lmo_inner, zeros(n))
+            lmo = FrankWolfe.ChasingGradientLMO(lmo_inner, max_rounds, improv_tol, d, residual)
+            x0 = sparse(deepcopy(x00))
+            res_boosting = FrankWolfe.frank_wolfe(
+                f,
+                grad!,
+                lmo,
+                x0,
+                max_iteration=500,
+                gradient=zeros(n),
+                line_search=FrankWolfe.Adaptive(),
+                verbose=true,
+            )
+            x0 = deepcopy(x00)
+            res = FrankWolfe.frank_wolfe(
+                f,
+                grad!,
+                lmo_inner,
+                x0,
+                max_iteration=500,
+                line_search=FrankWolfe.Adaptive(),
+                verbose=true,
+            )
+            @test isapprox(res_boosting[3], res[3])
+        end
+    end
+
+end
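Aside, not part of the patch: the `kwargs...` added to the algorithm entry points lets a caller thread LMO-specific options through `frank_wolfe` without touching the algorithm. A hypothetical sketch (`RadiusBallLMO` and its `radius` keyword are made up for illustration):

import FrankWolfe
using LinearAlgebra

struct RadiusBallLMO <: FrankWolfe.LinearMinimizationOracle end

function FrankWolfe.compute_extreme_point(::RadiusBallLMO, direction; radius=1.0, kwargs...)
    nd = norm(direction)
    # guard against a (numerically) zero direction; any feasible point works then
    nd <= eps(eltype(direction)) && return zeros(length(direction))
    return -radius / nd * direction
end

f(x) = sum(abs2, x .- 0.1)
grad!(storage, x) = (storage .= 2 .* (x .- 0.1); nothing)

# `radius=2.0` is collected by the new `kwargs...` in `frank_wolfe` and handed to the LMO
FrankWolfe.frank_wolfe(f, grad!, RadiusBallLMO(), ones(5), max_iteration=20, radius=2.0)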
diff --git a/test/temp.jl b/test/temp.jl
new file mode 100644
index 000000000..439d1a3a4
--- /dev/null
+++ b/test/temp.jl
@@ -0,0 +1,61 @@
+
+using Test
+using FrankWolfe
+using LinearAlgebra
+
+using SparseArrays
+
+using ProfileView
+using Random
+using BenchmarkTools
+import FrankWolfe: compute_extreme_point, LpNormLMO, KSparseLMO
+
+using Plots
+
+
+
+function test(lmo, direction, x0, xp)
+    for _ in 1:100
+        FrankWolfe.compute_extreme_point(lmo, direction, x=x0)
+    end
+    # res_boosting = FrankWolfe.frank_wolfe(
+    #     f,
+    #     grad!,
+    #     lmo,
+    #     x0,
+    #     gradient=zeros(n),
+    #     max_iteration=500,
+    #     print_iter=50,
+    #     line_search=FrankWolfe.Adaptive(),
+    #     verbose=true,
+    # )
+end
+
+
+function bench()
+    max_rounds = 10
+    improv_tol = 10e-3
+    for nb_dim in [Int(1e2), Int(1e3), Int(1e4), Int(1e5), Int(1e6)]
+        @show nb_dim
+        Random.seed!(1234)
+        xpi = rand(nb_dim)
+        total = sum(xpi)
+        xp = xpi ./ total
+        d = spzeros(nb_dim)
+        lmo_prob = FrankWolfe.ProbabilitySimplexOracle(1.0)
+        x0 = sparse(FrankWolfe.compute_extreme_point(lmo_prob, zeros(nb_dim)))
+        residual = zeros(nb_dim)
+        direction = rand(nb_dim)
+        f(x) = norm(x - xp)^2
+        function grad!(storage, x)
+            @. storage = 2 * (x - xp)
+            return nothing
+        end
+        lmo = FrankWolfe.ChasingGradientLMO(lmo_prob, max_rounds, improv_tol, d, residual)
+        # @profview test(lmo, direction, x0, xp)
+        # @profview test(lmo, direction, x0, xp)
+        @btime test($lmo, $direction, $x0, $xp)
+    end
+end
+
+bench()