FluxML · mcabbott · Feb 12, 2024 · Feb 9, 2024 · Feb 10, 2024 · Feb 12, 2024
diff --git a/Project.toml b/Project.toml
@@ -46,7 +46,7 @@ MacroTools = "0.5"
 Metal = "0.5, 1"
 NNlib = "0.9.1"
 OneHotArrays = "0.2.4"
-Optimisers = "0.2.12, 0.3.0"
+Optimisers = "0.3.2"
 Preferences = "1"
 ProgressLogging = "0.1"
 Reexport = "1.0"

diff --git a/src/Flux.jl b/src/Flux.jl
@@ -45,7 +45,7 @@ using .Optimise
 export Descent, Adam, Momentum, Nesterov, RMSProp,
   AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam,
   AdamW, RAdam, AdaBelief, InvDecay, ExpDecay,
-  WeightDecay, ClipValue, ClipNorm
+  WeightDecay, SignDecay, ClipValue, ClipNorm
 
 export ClipGrad, OptimiserChain  # these are const defined in deprecations, for ClipValue, Optimiser
 

diff --git a/src/deprecations.jl b/src/deprecations.jl
@@ -118,6 +118,7 @@ Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(ru
 for T in [:Descent, :Adam, :Momentum, :Nesterov,
    	      :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
    	      # :InvDecay, :ExpDecay, 
+          :SignDecay,
           ]
   @eval function _old_to_new(rule::$T)
     args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T))
@@ -126,14 +127,19 @@ for T in [:Descent, :Adam, :Momentum, :Nesterov,
 end
 _old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
 const OptimiserChain = Optimise.Optimiser  # lets you use new name with implicit params too.
-_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd)  # called gamma now
+_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd)  # called lambda now
 _old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh)  # called omega, and there are more fields 
 _old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh)  # called delta now, and struct name differs
 const ClipGrad = Optimise.ClipValue
 _old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon)  # RMSProp has no field centred
 
 _old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule")
 
+# This allows you to mix old-style Flux rules and new-style Optimisers.jl rules in one chain, like Flux.setup(OptimiserChain(Optimisers.SignDecay(), Flux.Descent()), [1,2,3.])
+Optimisers.OptimiserChain(rules::Union{Optimisers.AbstractRule, Optimise.AbstractOptimiser}...) =
+  Optimisers.OptimiserChain(map(_old_to_new, rules))
+_old_to_new(rule::Optimisers.AbstractRule) = rule
+
 # Since `update!` should be called in a loop, it makes less sense to call `setup` for you if you forgot.
 # But let's make sure that such uses give a helpful error:
 import .Optimise: update!

diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl
@@ -6,7 +6,7 @@ export train!, update!,
 	Descent, Adam, Momentum, Nesterov, RMSProp,
 	AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW,RAdam, OAdam, AdaBelief,
 	InvDecay, ExpDecay, WeightDecay, Optimiser,
-	ClipValue, ClipNorm
+	ClipValue, ClipNorm, SignDecay
 
 include("optimisers.jl")
 include("train.jl")

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
@@ -698,6 +698,29 @@ function apply!(o::WeightDecay, x, Δ)
   @. Δ += wd * x
 end
 
+"""
+    SignDecay(λ = 1e-3)
+
+Version of `WeightDecay` which implements ``L_1`` regularisation by adding `λ * sign(x)`
+to the gradient, when composed with other optimisers as the first transformation applied to it.
+
+# Examples
+
+```julia
+opt = Optimiser(SignDecay(1e-4), Adam())
+```
+"""
+mutable struct SignDecay <: AbstractOptimiser
+  lambda::Float32
+end
+
+SignDecay() = SignDecay(1f-3)
+
+function apply!(o::SignDecay, x, Δ)
+  λ = o.lambda
+  @. Δ += λ * sign(x)
+end
+
 """
     ClipValue(thresh)
 

diff --git a/test/optimise.jl b/test/optimise.jl
@@ -30,7 +30,7 @@ end
 @testset "Optimiser" begin
   Random.seed!(84)
   w = randn(10, 10)
-  @testset for Opt in [InvDecay, WeightDecay, ExpDecay]
+  @testset for Opt in [InvDecay, WeightDecay, ExpDecay, SignDecay]
     Random.seed!(42)
     w′ = randn(10, 10)
     loss(x) = Flux.Losses.mse(w*x, w′*x)