[WIP] Boosting FW #212

Draft · wants to merge 14 commits into master
2 changes: 1 addition & 1 deletion src/FrankWolfe.jl
@@ -41,7 +41,7 @@ include("fw_algorithms.jl")

# collecting most common data types etc and precompile
# min version req set to 1.5 to prevent stalling of julia 1
-@static if VERSION >= v"1.5"
+@static if VERSION >= v"1.5"
println("Precompiling common signatures. This might take a moment...")
include("precompile.jl")
end
33 changes: 17 additions & 16 deletions src/afw.jl
@@ -32,6 +32,7 @@ function away_frank_wolfe(
callback=nothing,
timeout=Inf,
print_callback=print_callback,
+ kwargs...,
)

# format string for output of the algorithm
@@ -89,7 +90,7 @@ function away_frank_wolfe(

x = compute_active_set_iterate(active_set)
grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
phi_value = max(0, fast_dot(x, gradient) - fast_dot(v, gradient))
gamma = 1.0

@@ -128,14 +129,14 @@ function away_frank_wolfe(
if away_steps
if lazy
d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
- lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K)
+ lazy_afw_step(x, gradient, lmo, active_set, phi_value; K=K, kwargs...)
else
d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
- afw_step(x, gradient, lmo, active_set)
+ afw_step(x, gradient, lmo, active_set; kwargs...)
end
else
d, vertex, index, gamma_max, phi_value, away_step_taken, fw_step_taken, tt =
- fw_step(x, gradient, lmo)
+ fw_step(x, gradient, lmo; kwargs...)
end

if fw_step_taken || away_step_taken
@@ -215,7 +216,7 @@ function away_frank_wolfe(
if verbose
x = compute_active_set_iterate(active_set)
grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
primal = f(x)
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
tt = last
@@ -237,7 +238,7 @@ function away_frank_wolfe(
active_set_cleanup!(active_set)
x = compute_active_set_iterate(active_set)
grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
primal = f(x)
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
if verbose
@@ -260,9 +261,9 @@ function away_frank_wolfe(
return x, v, primal, dual_gap, traj_data, active_set
end

- function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
+ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0, kwargs...)
v_lambda, v, v_loc, a_lambda, a, a_loc = active_set_argminmax(active_set, gradient)
- #Do lazy FW step
+ # Do lazy FW step
grad_dot_lazy_fw_vertex = fast_dot(v, gradient)
grad_dot_x = fast_dot(x, gradient)
grad_dot_a = fast_dot(a, gradient)
@@ -276,7 +277,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
fw_step_taken = true
index = v_loc
else
- #Do away step, as it promises enough progress.
+ # Do away step, as it promises enough progress.
if grad_dot_a - grad_dot_x > grad_dot_x - grad_dot_lazy_fw_vertex &&
grad_dot_a - grad_dot_x >= phi / K
tt = away
@@ -286,9 +287,9 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
away_step_taken = true
fw_step_taken = false
index = a_loc
- #Resort to calling the LMO
+ # Resort to calling the LMO
else
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
# Real dual gap promises enough progress.
grad_dot_fw_vertex = fast_dot(v, gradient)
dual_gap = grad_dot_x - grad_dot_fw_vertex
@@ -300,7 +301,7 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
away_step_taken = false
fw_step_taken = true
index = nothing
- #Lower our expectation for progress.
+ # Lower our expectation for progress.
else
tt = dualstep
phi = min(dual_gap, phi / 2.0)
@@ -316,11 +317,11 @@ function lazy_afw_step(x, gradient, lmo, active_set, phi; K=2.0)
return d, vertex, index, gamma_max, phi, away_step_taken, fw_step_taken, tt
end

- function afw_step(x, gradient, lmo, active_set)
+ function afw_step(x, gradient, lmo, active_set; kwargs...)
local_v_lambda, local_v, local_v_loc, a_lambda, a, a_loc =
active_set_argminmax(active_set, gradient)
away_gap = fast_dot(a, gradient) - fast_dot(x, gradient)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
grad_dot_x = fast_dot(x, gradient)
away_gap = fast_dot(a, gradient) - grad_dot_x
dual_gap = grad_dot_x - fast_dot(v, gradient)
@@ -344,8 +345,8 @@ function afw_step(x, gradient, lmo, active_set)
return d, vertex, index, gamma_max, dual_gap, away_step_taken, fw_step_taken, tt
end

- function fw_step(x, gradient, lmo)
- vertex = compute_extreme_point(lmo, gradient)
+ function fw_step(x, gradient, lmo; kwargs...)
+ vertex = compute_extreme_point(lmo, gradient, x=x; kwargs...)
return (
x - vertex,
vertex,
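The common thread in the `src/afw.jl` changes above is that every `compute_extreme_point` call now forwards the current iterate via `x=x` together with any extra keyword arguments. A minimal sketch of a user-defined LMO that can accept these (the type `IterateAwareSimplexLMO` is purely illustrative and not part of this PR; `FrankWolfe.LinearMinimizationOracle` is assumed to be the package's abstract oracle type):

```julia
using FrankWolfe
import FrankWolfe: compute_extreme_point

# Toy probability-simplex LMO that also receives the current iterate `x`
# through the keyword threaded by away_frank_wolfe / afw_step / fw_step above.
struct IterateAwareSimplexLMO <: FrankWolfe.LinearMinimizationOracle end

function compute_extreme_point(::IterateAwareSimplexLMO, direction; x=nothing, kwargs...)
    # standard simplex LMO: put all mass on the most negative entry of `direction`
    v = zero(direction)
    v[argmin(direction)] = 1
    # `x` is available here for iterate-dependent oracles (e.g. a boosting
    # variant combining several LMO calls around x); this sketch ignores it.
    return v
end
```

Unknown keywords forwarded by the algorithms are simply swallowed by `kwargs...`, so oracles that do not need them keep working.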
8 changes: 4 additions & 4 deletions src/blended_cg.jl
@@ -51,7 +51,7 @@ function blended_conditional_gradient(
primal = f(x)
grad!(gradient, x)
# initial gap estimate computation
- vmax = compute_extreme_point(lmo, gradient)
+ vmax = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
phi = fast_dot(gradient, x0 - vmax) / 2
dual_gap = phi
traj_data = []
@@ -243,7 +243,7 @@ function blended_conditional_gradient(
if verbose
x = compute_active_set_iterate(active_set)
grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
primal = f(x)
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
tot_time = (time_ns() - time_start) / 1e9
@@ -267,7 +267,7 @@ function blended_conditional_gradient(
active_set_renormalize!(active_set)
x = compute_active_set_iterate(active_set)
grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; lmo_kwargs...)
primal = f(x)
#dual_gap = 2phi
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
@@ -1025,7 +1025,7 @@ function lp_separation_oracle(
end
end
# otherwise, call the LMO
- y = compute_extreme_point(lmo, direction; kwargs...)
+ y = compute_extreme_point(lmo, direction; x=x, kwargs...)
# don't return nothing but y, fast_dot(direction, y) / use y for step outside / and update phi as in LCG (lines 402 - 406)
return (y, fast_dot(direction, y))
end
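Note that `src/afw.jl` passes the iterate as `compute_extreme_point(lmo, gradient, x=x; kwargs...)` while `lp_separation_oracle` here uses `compute_extreme_point(lmo, direction; x=x, kwargs...)`; in Julia both call styles are equivalent, since keyword arguments may appear before or after the semicolon at the call site. A small self-contained check:

```julia
# Both keyword-passing styles used in this diff behave identically.
g(direction; x=nothing, kwargs...) = (x, values(kwargs))

kw = (threshold=0.5,)
@assert g([1.0, -2.0], x=[0.5, 0.5]; kw...) == g([1.0, -2.0]; x=[0.5, 0.5], kw...)
```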
28 changes: 22 additions & 6 deletions src/fw_algorithms.jl
@@ -31,6 +31,7 @@ function frank_wolfe(
callback=nothing,
timeout=Inf,
print_callback=print_callback,
+ kwargs...,
)

# format string for output of the algorithm
@@ -131,7 +132,8 @@
end
first_iter = false

- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
+ #@show typeof(v)
# go easy on the memory - only compute if really needed
if (
(mod(t, print_iter) == 0 && verbose) ||
@@ -159,6 +161,9 @@
step_lim,
one(eltype(x)),
)
+ #@show typeof(x)
+ #@show typeof(d)
+ #@show typeof(x-gamma*d)
if callback !== nothing
state = (
t=t,
@@ -201,7 +206,9 @@
# hence the final computation.

grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
+ #@show v
+ #@show typeof(v)
primal = f(x)
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
if verbose
@@ -256,6 +263,7 @@ function lazified_conditional_gradient(
callback=nothing,
timeout=Inf,
print_callback=print_callback,
+ kwargs...,
)

# format string for output of the algorithm
@@ -351,7 +359,14 @@ function lazified_conditional_gradient(
primal = f(x)
end

- v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy)
+ v = compute_extreme_point(
+ lmo,
+ gradient,
+ threshold=threshold,
+ greedy=greedy_lazy,
+ x=x;
+ kwargs...,
+ )
tt = lazy
if fast_dot(v, gradient) > threshold
tt = dualstep
@@ -418,7 +433,7 @@ function lazified_conditional_gradient(
# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting
# hence the final computation.
grad!(gradient, x)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
primal = f(x)
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)

@@ -470,6 +485,7 @@ function stochastic_frank_wolfe(
callback=nothing,
timeout=Inf,
print_callback=print_callback,
+ kwargs...,
)

# format string for output of the algorithm
@@ -568,7 +584,7 @@ function stochastic_frank_wolfe(
end
first_iter = false

- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)

# go easy on the memory - only compute if really needed
if (mod(t, print_iter) == 0 && verbose) ||
@@ -632,7 +648,7 @@ function stochastic_frank_wolfe(
# last computation done with full evaluation for exact gradient

(primal, gradient) = compute_value_gradient(f, x, full_evaluation=true)
- v = compute_extreme_point(lmo, gradient)
+ v = compute_extreme_point(lmo, gradient, x=x; kwargs...)
# @show (gradient, primal)
dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
if verbose
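With the trailing `kwargs...` added to `frank_wolfe`, `lazified_conditional_gradient`, and `stochastic_frank_wolfe`, oracle-specific keywords can be passed straight through the solver to the LMO. A usage sketch under stated assumptions (the `oracle_rounds` keyword is hypothetical, and `IterateAwareSimplexLMO` is the illustrative oracle defined in the sketch after `src/afw.jl`):

```julia
using FrankWolfe, LinearAlgebra

n = 100
xp = rand(n)
f(x) = norm(x - xp)^2
grad!(storage, x) = (storage .= 2 .* (x .- xp); nothing)

lmo = IterateAwareSimplexLMO()                      # illustrative oracle from above
x0 = FrankWolfe.compute_extreme_point(lmo, ones(n)) # a feasible starting vertex

# Any keyword not consumed by frank_wolfe itself (here the hypothetical
# `oracle_rounds`) is now forwarded to every compute_extreme_point call.
x, v, primal, dual_gap, traj = FrankWolfe.frank_wolfe(
    f,
    grad!,
    lmo,
    x0;
    max_iteration=500,
    oracle_rounds=3,
)
```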