rewrote old one as AdaptiveZerothOrder

matbesancon committed Nov 8, 2023
1 parent c438c31 commit df8616c
Showing 12 changed files with 207 additions and 56 deletions.
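In short: the former `FrankWolfe.Adaptive` line search (function-value, i.e. zeroth-order, backtracking test) is renamed `AdaptiveZerothOrder`, and `Adaptive` is reintroduced as a new variant whose backtracking test uses a gradient evaluation instead. Below is a minimal sketch of how callers choose between the two after this commit; the objective, LMO, and starting point are hypothetical illustration, while the constructors and the `frank_wolfe` call pattern follow the tests in this diff:

```julia
using FrankWolfe, LinearAlgebra

# Hypothetical toy problem: minimize ‖x - p‖² over the probability simplex.
n = 100
p = rand(n)
f(x) = norm(x - p)^2
grad!(storage, x) = (storage .= 2 .* (x .- p))

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
x0 = collect(FrankWolfe.compute_extreme_point(lmo, zeros(n)))

# formerly `FrankWolfe.Adaptive()`: zeroth-order (function-value) test
x_zo, _ = FrankWolfe.frank_wolfe(
    f, grad!, lmo, x0;
    max_iteration=1000,
    line_search=FrankWolfe.AdaptiveZerothOrder(),
)

# the new `FrankWolfe.Adaptive()`: gradient-based test
x_fo, _ = FrankWolfe.frank_wolfe(
    f, grad!, lmo, x0;
    max_iteration=1000,
    line_search=FrankWolfe.Adaptive(),
)
```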
2 changes: 1 addition & 1 deletion src/alternating_methods.jl
@@ -92,7 +92,7 @@ function ProjectionFW(y, lmo; max_iter=10000, eps=1e-3)
epsilon=eps,
max_iteration=max_iter,
trajectory=true,
line_search=FrankWolfe.Adaptive(verbose=false, relaxed_smoothness=true),
line_search=FrankWolfe.Adaptive(verbose=false, relaxed_smoothness=false),
)
return x_opt
end
157 changes: 124 additions & 33 deletions src/linesearch.jl
@@ -333,7 +333,7 @@ It is also the fallback when the Lipschitz constant estimation fails due to numerical errors.
`perform_line_search` also has a `should_upgrade` keyword argument controlling
whether a temporary upgrade to `BigFloat` is performed for extended precision.
"""
mutable struct Adaptive{T,TT} <: LineSearchMethod
mutable struct AdaptiveZerothOrder{T,TT} <: LineSearchMethod
eta::T
tau::TT
L_est::T
@@ -343,36 +343,36 @@ mutable struct Adaptive{T,TT} <: LineSearchMethod
relaxed_smoothness::Bool
end

Adaptive(eta::T, tau::TT) where {T,TT} =
Adaptive{T,TT}(eta, tau, T(Inf), T(1e10), T(0.5), true, false)
AdaptiveZerothOrder(eta::T, tau::TT) where {T,TT} =
AdaptiveZerothOrder{T,TT}(eta, tau, T(Inf), T(1e10), T(0.5), true, false)

Adaptive(;
AdaptiveZerothOrder(;
eta=0.9,
tau=2,
L_est=Inf,
max_estimate=1e10,
alpha=0.5,
verbose=true,
relaxed_smoothness=false,
) = Adaptive(eta, tau, L_est, max_estimate, alpha, verbose, relaxed_smoothness)
) = AdaptiveZerothOrder(eta, tau, L_est, max_estimate, alpha, verbose, relaxed_smoothness)

struct AdaptiveWorkspace{XT,BT}
struct AdaptiveZerothOrderWorkspace{XT,BT}
x::XT
xbig::BT
end

build_linesearch_workspace(::Adaptive, x, gradient) = AdaptiveWorkspace(similar(x), big.(x))
build_linesearch_workspace(::AdaptiveZerothOrder, x, gradient) = AdaptiveZerothOrderWorkspace(similar(x), big.(x))

function perform_line_search(
line_search::Adaptive,
line_search::AdaptiveZerothOrder,
t,
f,
grad!,
gradient,
x,
d,
gamma_max,
storage::AdaptiveWorkspace,
storage::AdaptiveZerothOrderWorkspace,
memory_mode::MemoryEmphasis;
should_upgrade::Val=Val{false}(),
)
@@ -401,32 +401,21 @@ function perform_line_search(

gradient_storage = similar(gradient)

# while f(x_storage) - f(x) >
# -γ * α * dot_dir + α^2 * γ^2 * ndir2 * M / 2 + eps(float(γ)) &&
# γ ≥ 100 * eps(float(γ))

# # DEPRECATED / remove in future versions
# # Additional smoothness condition
# if line_search.relaxed_smoothness
# grad!(gradient_storage, x_storage)
# if fast_dot(gradient, d) - fast_dot(gradient_storage, d) <= γ * M * ndir2 + eps(float(γ))
# break
# end
# end

#################
# modified adaptive line search test from:
# S. Pokutta "The Frank-Wolfe algorithm: a short introduction" (2023), preprint
# replaces the original test from:
# Pedregosa, F., Negiar, G., Askari, A., and Jaggi, M. (2020). "Linearly convergent Frank–Wolfe with backtracking line-search", Proceedings of AISTATS.
#################
grad!(gradient_storage, x_storage)
while 0 > fast_dot(gradient_storage, d) && γ ≥ 100 * eps(float(γ))

while f(x_storage) - f(x) >
-γ * α * dot_dir + α^2 * γ^2 * ndir2 * M / 2 + eps(float(γ)) &&
γ ≥ 100 * eps(float(γ))

# Additional smoothness condition
if line_search.relaxed_smoothness
grad!(gradient_storage, x_storage)
if fast_dot(gradient, d) - fast_dot(gradient_storage, d) <= γ * M * ndir2 + eps(float(γ))
break
end
end

M *= line_search.tau
γ = min(max(dot_dir / (M * ndir2), 0), gamma_max)
x_storage = muladd_memory_mode(memory_mode, x_storage, x, γ, d)
grad!(gradient_storage, x_storage)

niter += 1
if M > line_search.max_estimate
@@ -458,7 +447,7 @@ function perform_line_search(
return γ
end
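For orientation, here is the sufficient-decrease test the zeroth-order loop above enforces, written out explicitly. This reconstruction assumes, consistent with the rest of the package, that `muladd_memory_mode(mem, s, x, γ, d)` forms the candidate point s = x − γd; `dot_dir` is ⟨∇f(x), d⟩, `ndir2` is ‖d‖², α is the `alpha` field, and τ the `tau` multiplier:

```latex
% Zeroth-order backtracking: while the test below fails, inflate M <- tau*M
% and recompute gamma; accept (up to eps-safeguards) once
\[
f(x - \gamma d) - f(x) \le -\alpha \gamma \, \langle \nabla f(x), d \rangle
  + \frac{\alpha^2 \gamma^2 M \lVert d \rVert^2}{2},
\qquad
\gamma = \min\left( \max\left( \frac{\langle \nabla f(x), d \rangle}{M \lVert d \rVert^2},\, 0 \right),\, \gamma_{\max} \right).
\]
```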

Base.print(io::IO, ::Adaptive) = print(io, "Adaptive")
Base.print(io::IO, ::AdaptiveZerothOrder) = print(io, "AdaptiveZerothOrder")

function _upgrade_accuracy_adaptive(gradient, direction, storage, ::Val{true})
direction_big = big.(direction)
@@ -473,6 +462,108 @@ function _upgrade_accuracy_adaptive(gradient, direction, storage, ::Val{false})
return (dot_dir, ndir2, storage.x)
end
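The `Val{true}`/`Val{false}` argument above is the standard Julia idiom for selecting a method at compile time, here choosing whether inputs are promoted to `BigFloat`. A self-contained sketch of the idiom, with hypothetical names:

```julia
# Val-dispatch: the type of the second argument picks the method statically.
upgrade(x, ::Val{true})  = big.(x)  # extended-precision copy (BigFloat)
upgrade(x, ::Val{false}) = x        # unchanged

upgrade([1.0, 2.0], Val{true}())   # Vector{BigFloat}
upgrade([1.0, 2.0], Val{false}())  # Vector{Float64}
```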

mutable struct Adaptive{T,TT} <: LineSearchMethod
eta::T
tau::TT
L_est::T
max_estimate::T
verbose::Bool
relaxed_smoothness::Bool
end

Adaptive(eta::T, tau::TT) where {T,TT} =
Adaptive{T,TT}(eta, tau, T(Inf), T(1e10), true, false)

Adaptive(;
eta=0.9,
tau=2,
L_est=Inf,
max_estimate=1e10,
verbose=true,
relaxed_smoothness=false,
) = Adaptive(eta, tau, L_est, max_estimate, verbose, relaxed_smoothness)

struct AdaptiveWorkspace{XT,BT}
x::XT
xbig::BT
gradient_storage::XT
end

build_linesearch_workspace(::Adaptive, x, gradient) = AdaptiveWorkspace(similar(x), big.(x), similar(x))

function perform_line_search(
line_search::Adaptive,
t,
f,
grad!,
gradient,
x,
d,
gamma_max,
storage::AdaptiveWorkspace,
memory_mode::MemoryEmphasis;
should_upgrade::Val=Val{false}(),
)
if norm(d) ≤ length(d) * eps(float(eltype(d)))
if should_upgrade isa Val{true}
return big(zero(promote_type(eltype(d), eltype(gradient))))
else
return zero(promote_type(eltype(d), eltype(gradient)))
end
end
x_storage = storage.x
if !isfinite(line_search.L_est)
epsilon_step = min(1e-3, gamma_max)
gradient_stepsize_estimation = storage.gradient_storage
x_storage = muladd_memory_mode(memory_mode, x_storage, x, epsilon_step, d)
grad!(gradient_stepsize_estimation, x_storage)
line_search.L_est = norm(gradient - gradient_stepsize_estimation) / (epsilon_step * norm(d))
end
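# Clarifying note (added; assumes muladd_memory_mode writes x_storage = x - ε*d):
# the block above seeds L_est with the finite-difference secant estimate
#     L_est ≈ ‖∇f(x) - ∇f(x - ε d)‖ / (ε ‖d‖),  with ε = min(1e-3, γ_max),
# i.e. an observed local Lipschitz constant of the gradient along d.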
M = line_search.eta * line_search.L_est
(dot_dir, ndir2, x_storage) = _upgrade_accuracy_adaptive(gradient, d, storage, should_upgrade)
γ = min(max(dot_dir / (M * ndir2), 0), gamma_max)
x_storage = muladd_memory_mode(memory_mode, x_storage, x, γ, d)
niter = 0
clipping = false
gradient_storage = storage.gradient_storage

grad!(gradient_storage, x_storage)
while 0 > fast_dot(gradient_storage, d) && γ ≥ 100 * eps(float(γ))
M *= line_search.tau
γ = min(max(dot_dir / (M * ndir2), 0), gamma_max)
x_storage = muladd_memory_mode(memory_mode, x_storage, x, γ, d)
grad!(gradient_storage, x_storage)

niter += 1
if M > line_search.max_estimate
# if this occurs, we have hit numerical trouble
# if we are not already using relaxed smoothness, we first retry with it as a more stable fallback
# note that the smoothness estimate is not updated at this iteration.
if !line_search.relaxed_smoothness
linesearch_fallback = deepcopy(line_search)
linesearch_fallback.relaxed_smoothness = true
return perform_line_search(
linesearch_fallback, t, f, grad!, gradient, x, d, gamma_max, storage, memory_mode;
should_upgrade=should_upgrade,
)
end
# if we are already using relaxed smoothness, produce a warning:
# one might see negative progress, cycling, or stalling.
# Potentially upgrade accuracy or use an alternative line search strategy
if line_search.verbose
@warn "Smoothness estimate ran away -> hard clipping. Convergence might not be guaranteed."
end
clipping = true
break
end
end
if !clipping
line_search.L_est = M
end
γ = min(max(dot_dir / (line_search.L_est * ndir2), 0), gamma_max)
return γ
end
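In contrast to the zeroth-order test, the new `Adaptive` backtracks on a first-order condition: M is inflated until the inner product ⟨∇f(x − γd), d⟩ becomes nonnegative, i.e. until the step no longer overshoots the one-dimensional minimizer along d, so each backtracking iteration costs one gradient evaluation instead of one function evaluation. It also reuses the preallocated `storage.gradient_storage` rather than allocating a gradient buffer on every call, as the zeroth-order version above does. Under the same x − γd convention as before:

```latex
% First-order backtracking: inflate M <- tau*M while the step overshoots,
\[
\langle \nabla f(x - \gamma d),\, d \rangle < 0,
\qquad
\gamma = \min\left( \max\left( \frac{\langle \nabla f(x), d \rangle}{M \lVert d \rVert^2},\, 0 \right),\, \gamma_{\max} \right),
\]
% and stop as soon as the inner product is nonnegative.
```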

"""
MonotonicStepSize{F}
4 changes: 2 additions & 2 deletions test/bcg_direction_error.jl
@@ -30,7 +30,7 @@ x, v, primal, dual_gap, _, _ = FrankWolfe.blended_conditional_gradient(
lmo,
x0,
max_iteration=k,
line_search=FrankWolfe.Adaptive(L_est=2.0),
line_search=FrankWolfe.AdaptiveZerothOrder(L_est=2.0),
print_iter=100,
memory_mode=FrankWolfe.InplaceEmphasis(),
verbose=true,
@@ -52,7 +52,7 @@ x, v, primal_cut, dual_gap, _, _ = FrankWolfe.blended_conditional_gradient(
lmo,
x0,
max_iteration=k,
line_search=FrankWolfe.Adaptive(L_est=2.0),
line_search=FrankWolfe.AdaptiveZerothOrder(L_est=2.0),
print_iter=k / 10,
memory_mode=FrankWolfe.InplaceEmphasis(),
verbose=true,
6 changes: 3 additions & 3 deletions test/blended_accelerated.jl
@@ -39,7 +39,7 @@ const L = eigmax(hessian)
copy(x0),
epsilon=target_tolerance,
max_iteration=k,
line_search=FrankWolfe.Adaptive(L_est=L),
line_search=FrankWolfe.AdaptiveZerothOrder(L_est=L),
print_iter=k / 10,
hessian=hessian,
memory_mode=FrankWolfe.InplaceEmphasis(),
@@ -57,7 +57,7 @@ const L = eigmax(hessian)
copy(x0),
epsilon=target_tolerance,
max_iteration=k,
line_search=FrankWolfe.Adaptive(L_est=L),
line_search=FrankWolfe.AdaptiveZerothOrder(L_est=L),
print_iter=k / 10,
hessian=hessian,
memory_mode=FrankWolfe.InplaceEmphasis(),
@@ -75,7 +75,7 @@ const L = eigmax(hessian)
copy(x0),
epsilon=target_tolerance,
max_iteration=k,
line_search=FrankWolfe.Adaptive(L_est=L),
line_search=FrankWolfe.AdaptiveZerothOrder(L_est=L),
print_iter=k / 10,
memory_mode=FrankWolfe.InplaceEmphasis(),
verbose=false,
23 changes: 23 additions & 0 deletions test/oddities.jl
@@ -24,6 +24,17 @@ using Test
) < 1.0e-10

x0 = copy(x00)
@test abs(
FrankWolfe.away_frank_wolfe(
f,
grad!,
lmo,
x0,
max_iteration=1000,
line_search=FrankWolfe.AdaptiveZerothOrder(),
verbose=true,
)[3],
) < 1.0e-10
@test abs(
FrankWolfe.away_frank_wolfe(
f,
@@ -38,6 +49,18 @@ using Test


x0 = copy(x00)
@test abs(
FrankWolfe.away_frank_wolfe(
f,
grad!,
lmo,
x0,
max_iteration=1000,
lazy=true,
line_search=FrankWolfe.AdaptiveZerothOrder(),
verbose=true,
)[3],
) < 1.0e-10
@test abs(
FrankWolfe.away_frank_wolfe(
f,
12 changes: 6 additions & 6 deletions test/pairwise.jl
@@ -23,7 +23,7 @@ end
lmo_prob,
x0,
max_iteration=6000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
verbose=false,
epsilon=3e-7,
)
@@ -33,7 +33,7 @@ end
lmo_prob,
x0,
max_iteration=6000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
print_iter=100,
verbose=false,
epsilon=3e-7,
@@ -46,7 +46,7 @@ end
lmo_prob,
x0,
max_iteration=6000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
verbose=false,
lazy=true,
epsilon=3e-7,
@@ -73,7 +73,7 @@ end
lmo_prob,
x0,
max_iteration=6000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
verbose=false,
epsilon=3e-7,
callback=test_callback,
@@ -96,7 +96,7 @@ end
lmo,
x0,
max_iteration=6000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
epsilon=3e-7,
verbose=false,
)
@@ -109,7 +109,7 @@ end
lmo,
x0,
max_iteration=6000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
epsilon=3e-7,
verbose=false,
recompute_last_vertex=false,
6 changes: 3 additions & 3 deletions test/runtests.jl
@@ -185,7 +185,7 @@ end
lmo_prob,
x0,
max_iteration=1000,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
verbose=true,
)[3] - 0.2,
) < 1.0e-5
@@ -444,7 +444,7 @@ end
lmo,
x0,
max_iteration=k,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
print_iter=k / 10,
memory_mode=FrankWolfe.InplaceEmphasis(),
verbose=true,
@@ -459,7 +459,7 @@ end
lmo,
x0,
max_iteration=k,
line_search=FrankWolfe.Adaptive(),
line_search=FrankWolfe.AdaptiveZerothOrder(),
print_iter=k / 10,
memory_mode=FrankWolfe.InplaceEmphasis(),
verbose=true,