Add all-keyword constructors, much like @kwdef #160

Merged: 5 commits, Feb 8, 2024
12 changes: 9 additions & 3 deletions src/interface.jl
@@ -241,7 +241,8 @@ like this:
struct Rule
eta::Float64
beta::Tuple{Float64, Float64}
Rule(eta = 0.1, beta = (0.7, 0.8)) = eta < 0 ? error() : new(eta, beta)
Rule(eta, beta = (0.7, 0.8)) = eta < 0 ? error() : new(eta, beta)
Rule(; eta = 0.1, beta = (0.7, 0.8)) = Rule(eta, beta)
end
```
Any field called `eta` is assumed to be a learning rate, and cannot be negative.
@@ -259,12 +260,17 @@ macro def(expr)
lines[i] = :($name::$typeof($val))
end
rule = Meta.isexpr(expr.args[2], :<:) ? expr.args[2].args[1] : expr.args[2]
params = [Expr(:kw, nv...) for nv in zip(names,vals)]
check = :eta in names ? :(eta < 0 && throw(DomainError(eta, "the learning rate cannot be negative"))) : nothing
inner = :(function $rule($([Expr(:kw, nv...) for nv in zip(names,vals)]...))
# Positional-argument method, has defaults for all but the first arg:
inner = :(function $rule($(names[1]), $(params[2:end]...))
$check
new($(names...))
end)
push!(lines, inner)
# Keyword-argument method. (Made an inner constructor only to allow
# resulting structs to be @doc'd... cannot if macro returns a block.)
kwmethod = :($rule(; $(params...)) = $rule($(names...)))
push!(lines, inner, kwmethod)
esc(expr)
end
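To make the change concrete, here is a hand-written sketch of the code the revised `@def` above is meant to generate, using a made-up `MyRule` (not a rule from the package) and omitting the `<: AbstractRule` supertype so the snippet stands on its own:

```julia
# Roughly what `@def struct MyRule; eta = 0.01; rho = 0.9; end` should expand to
# after this PR: typed fields, the positional inner constructor with defaults for
# everything but the first field, and the new all-keyword method.
struct MyRule
    eta::Float64
    rho::Float64
    function MyRule(eta, rho = 0.9)
        # any field called `eta` gets the learning-rate check
        eta < 0 && throw(DomainError(eta, "the learning rate cannot be negative"))
        new(eta, rho)
    end
    MyRule(; eta = 0.01, rho = 0.9) = MyRule(eta, rho)
end

MyRule(0.1, 0.8) === MyRule(rho = 0.8, eta = 0.1)  # true: both construct MyRule(0.1, 0.8)
```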

35 changes: 22 additions & 13 deletions src/rules.jl
@@ -8,18 +8,19 @@

"""
Descent(η = 1f-1)
Descent(; eta)

Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `dp`, this runs `p -= η*dp`.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
the weights.
"""
struct Descent{T} <: AbstractRule
eta::T
end
Descent() = Descent(1f-1)
Descent(; eta = 1f-1) = Descent(eta)

init(o::Descent, x::AbstractArray) = nothing
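A brief usage sketch of the constructors above; the keyword form assumes this PR, and the element types simply follow from the literals used:

```julia
using Optimisers

Descent()            # Descent{Float32}(0.1f0), from the default eta = 1f-1
Descent(0.01)        # Descent{Float64}(0.01), positional learning rate
Descent(eta = 0.01)  # Descent{Float64}(0.01), the same rule via the new keyword form
```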

@@ -37,13 +38,14 @@ end

"""
Momentum(η = 0.01, ρ = 0.9)
Momentum(; [eta, rho])

Gradient descent optimizer with learning rate `η` and momentum `ρ`.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
the weights.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
- Momentum (`ρ == rho`): Controls the acceleration of gradient descent in the
prominent direction, in effect dampening oscillations.
"""
@def struct Momentum <: AbstractRule
@@ -89,6 +91,7 @@ end

"""
RMSProp(η = 0.001, ρ = 0.9, ϵ = 1e-8; centred = false)
RMSProp(; [eta, rho, epsilon, centred])

Optimizer using the
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
@@ -99,11 +102,11 @@ generally don't need tuning.
gradients by an estimate of their variance, instead of their second moment.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
the weights.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
- Momentum (`ρ == rho`): Controls the acceleration of gradient descent in the
prominent direction, in effect dampening oscillations.
- Machine epsilon (`ϵ`): Constant to prevent division by zero
- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero
(no need to change default)
- Keyword `centred` (or `centered`): Indicates whether to use the centred variant
of the algorithm.
@mcabbott (Member Author) commented on Oct 9, 2023:
How should these be documented? 7706ffd is one suggestion.

We could also remove all greek glyphs from the docstrings, in favour of ascii field names. Some formulae would get a bit longer, or perhaps those few could show both.

A Member replied:
Maybe keep the ascii version in the part of the docstring that lists the actual parameter name, then use latex-ified greek characters in the description of what the parameter does, if applicable?

@mcabbott (Member Author) replied:
Not sure I follow.

Goal of the current style is partly to be usable even if you don't know the names of greek letters. This was true before, as long as you can identify where the same scribble shows up.

@ToucheSir (Member) commented on Feb 8, 2024:
I was thinking of how other libraries use embedded latex in their extended docstrings to describe what optimization rules are doing. e.g. https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html#torch.optim.AdamW. Not something required for every field of every rule, but we don't have to purge greek characters from the docstrings completely.

@@ -115,10 +118,11 @@ struct RMSProp <: AbstractRule
centred::Bool
end

function RMSProp(η = 0.001, ρ = 0.9, ϵ = 1e-8; centred::Bool = false, centered::Bool = false)
function RMSProp(η, ρ = 0.9, ϵ = 1e-8; centred::Bool = false, centered::Bool = false)
η < 0 && throw(DomainError(η, "the learning rate cannot be negative"))
RMSProp(η, ρ, ϵ, centred | centered)
end
RMSProp(; eta = 0.001, rho = 0.9, epsilon = 1e-8, kw...) = RMSProp(eta, rho, epsilon; kw...)

init(o::RMSProp, x::AbstractArray) = (zero(x), o.centred ? zero(x) : false)
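For illustration, a sketch of how the new keyword method is expected to forward to the positional constructor above; the particular values are arbitrary:

```julia
using Optimisers

# The keyword form passes eta/rho/epsilon positionally and forwards `centred` as a keyword:
r1 = RMSProp(eta = 3e-3, centred = true)
r2 = RMSProp(3e-3, 0.9, 1e-8; centred = true)
r1 == r2                # true: identical field values

# The learning-rate check applies through either path:
# RMSProp(eta = -1.0)   # would throw DomainError: "the learning rate cannot be negative"
```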

@@ -488,22 +492,27 @@ end

"""
AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8)
AdamW(; [eta, beta, lambda, epsilon])

[AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its
weight decay regularization.
Implemented as an [`OptimiserChain`](@ref) of [`Adam`](@ref) and [`WeightDecay`](@ref).

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
- Weight decay (`λ`): Controls the strength of ``L_2`` regularisation.
- Machine epsilon (`ϵ`): Constant to prevent division by zero
- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation.
- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero
(no need to change default)
"""
AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8) =
AdamW(η, β = (0.9, 0.999), λ = 0.0, ϵ = 1e-8) =
OptimiserChain(Adam(η, β, ϵ), WeightDecay(λ))

AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0, epsilon = 1e-8) =
OptimiserChain(Adam(eta, beta, epsilon), WeightDecay(lambda))
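And a short end-to-end sketch of the keyword form together with the package's `setup`/`update!` API; the parameter vector and gradient below are made up purely for illustration:

```julia
using Optimisers

x = [1.0, 2.0]                           # parameters to be trained
opt = AdamW(eta = 1e-3, lambda = 1e-4)   # same as OptimiserChain(Adam(0.001, (0.9, 0.999), 1.0e-8), WeightDecay(0.0001))
state = Optimisers.setup(opt, x)
state, x = Optimisers.update!(state, x, [0.1, 0.2])  # one step, with a hand-written gradient
```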

"""
AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)

6 changes: 6 additions & 0 deletions test/runtests.jl
@@ -330,6 +330,12 @@ end
@test_throws ArgumentError Optimisers.thaw!(m)
end

@testset "keyword arguments" begin
@test Nesterov(rho=0.8, eta=0.1) === Nesterov(0.1, 0.8)
@test AdamW(lambda=0.3).opts[1] == Adam()
@test AdamW(lambda=0.3).opts[2] == WeightDecay(0.3)
end

@testset "forgotten gradient" begin
x = [1.0, 2.0]
sx = Optimisers.setup(Descent(), x)