Skip to content


Export Optimisers and remove params and Optimise from tests (#2495)
Browse files Browse the repository at this point in the history
  • Loading branch information
CarloLucibello authored Nov 5, 2024
1 parent 5dea1af commit e1989b5
Show file tree
Hide file tree
Showing 19 changed files with 164 additions and 426 deletions.
12 changes: 7 additions & 5 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ See also [github's page]( for a compl

## v0.15.0
* Recurrent layers have undergone a complete redesign in [PR 2500](https://github.com/FluxML/Flux.jl/pull/2500).
* `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally. Instead, they now take the previous state as input and return the updated state as output.
* These layers (`RNN`, `LSTM`, `GRU`) now process entire sequences at once, rather than one element at a time.
* The `Recur` wrapper has been deprecated and removed.
* The `reset!` function has also been removed; state management is now entirely up to the user.
* `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing.
* `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing: `rnncell(x_t, h_t) -> h_{t+1}`.
* `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally; it has to be explicitly passed to the layer. Moreover, they now process entire sequences at once, rather than one element at a time: `rnn(x, h) -> h′`.
* The `Recur` wrapper has been deprecated and removed.
* The `reset!` function has also been removed; state management is now entirely up to the user.
* The `Flux.Optimise` module has been deprecated in favor of the Optimisers.jl package.
Now Flux re-exports the optimisers from Optimisers.jl. Most users will be unaffected by this change.
The module is still available for now, but will be removed in a future release.

## v0.14.22
* Data movement between devices is now provided by [MLDataDevices.jl](
Expand Down
2 changes: 1 addition & 1 deletion docs/src/guide/models/
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Under the hood, the Flux [`Flux.train!`](@ref) function uses *a loss function* a
julia> using Flux: train!
julia> opt = Descent()
julia> data = [(x_train, y_train)]
1-element Vector{Tuple{Matrix{Int64}, Matrix{Int64}}}:
Expand Down
11 changes: 4 additions & 7 deletions src/Flux.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ using MLUtils
const stack = MLUtils.stack # now exported by Base
import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions
using Optimisers: freeze!, thaw!, adjust!, trainables
@reexport using Optimisers

using Random: default_rng
using Zygote, ChainRulesCore
using Zygote: Params, @adjoint, gradient, pullback
Expand Down Expand Up @@ -56,13 +58,8 @@ export Chain, Dense, Embedding, EmbeddingBag,

using .Optimise
export Descent, Adam, Momentum, Nesterov, RMSProp,
AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam,
AdamW, RAdam, AdaBelief, InvDecay, ExpDecay,
WeightDecay, SignDecay, ClipValue, ClipNorm

export ClipGrad, OptimiserChain # these are const defined in deprecations, for ClipValue, Optimiser
using .Optimise: Optimise
export ClipValue # this is const defined in deprecations, for ClipGrad

using .Train
Expand Down
48 changes: 35 additions & 13 deletions src/deprecations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,40 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error

train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
train!(loss, model, data, _old_to_new(opt); cb)
train!(loss, model, data, __old_to_new(opt); cb)

# Next, to use the new `setup` with the still-exported old-style `Adam` etc:
import .Train: setup
setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
# ... and allow accidental use of `Optimisers.setup` to do the same:
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)

function __old_to_new(rule)
Base.depwarn("""Optimisers from Flux.Optimise module are deprecated.
Use optimisers from Optimisers.jl instead.""", :__old_to_new)
return _old_to_new(rule)

for T in [:Descent, :Adam, :Momentum, :Nesterov,
:AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
# :InvDecay, :ExpDecay,
@eval function _old_to_new(rule::$T)
@eval function _old_to_new(rule::Optimise.$T)
args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T))
_old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
_old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
_old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
const ClipGrad = Optimise.ClipValue
_old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred
_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
const Optimiser = Optimisers.OptimiserChain
_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
# const ClipGrad = Optimise.ClipValue
const ClipValue = Optimisers.ClipGrad
_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred

_old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule")

Expand All @@ -83,8 +92,21 @@ function update!(opt::Optimise.AbstractOptimiser, model, grad)
# to accept only arrays. Remove if this causes problems!
# update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄)
error("""Invalid input to `update!`.
* For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)`
* For the explicit style, `update(state, model, grad)` needs `state = Flux.setup(opt, model)`.
* For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)`
* For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.

# TODO this friendly error should go in Optimisers.jl.
# remove after
function update!(opt::Optimisers.AbstractRule, model, grad)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.

Expand Down
4 changes: 2 additions & 2 deletions src/layers/conv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters
julia> layer(randn(100, 4, 64)) |> size
(98, 5, 64)
julia> Flux.params(layer) |> length
julia> Flux.trainables(layer) |> length
Expand Down Expand Up @@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters
julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling)
(102, 4, 64)
julia> Flux.params(layer) |> length
julia> Flux.trainables(layer) |> length
Expand Down
10 changes: 5 additions & 5 deletions src/layers/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,15 @@ function _layer_show(io::IO, layer, indent::Int=0, name=nothing)
_str = isnothing(name) ? "" : "$name = "
str = _str * _layer_string(io, layer)
print(io, " "^indent, str, indent==0 ? "" : ",")
if !isempty(params(layer))
if !isempty(trainables(layer))
print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str)))
printstyled(io, "# ", underscorise(sum(length, params(layer); init=0)), " parameters";
printstyled(io, "# ", underscorise(sum(length, trainables(layer); init=0)), " parameters";
nonparam = _childarray_sum(length, layer) - sum(length, params(layer), init=0)
nonparam = _childarray_sum(length, layer) - sum(length, trainables(layer), init=0)
if nonparam > 0
printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? " non-trainable" : ""; color=:light_black)
_nan_show(io, params(layer))
_nan_show(io, trainables(layer))
indent==0 || println(io)
Expand All @@ -127,7 +127,7 @@ function _layer_string(::IO, a::AbstractArray)

function _big_finale(io::IO, m)
ps = params(m)
ps = trainables(m)
if length(ps) > 2
pars = underscorise(sum(length, ps; init=0))
bytes = Base.format_bytes(Base.summarysize(m))
Expand Down
2 changes: 0 additions & 2 deletions src/outputsize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,6 @@ function ChainRulesCore.rrule(::typeof(striplazy), m)
striplazy(m), _ -> error("striplazy should never be used within a gradient")

params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.")

Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.")

function Base.show(io::IO, l::LazyLayer)
Expand Down
10 changes: 6 additions & 4 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,20 @@ using Random
# test interaction with `train!`
θ = ones(2)
X = zeros(2, 10)
loss(x) = sum((x .- θ).^2)
loss(θ, x) = sum((x .- θ).^2)
d = DataLoader(X)
Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
opt_state = Flux.setup(Descent(0.1), θ)
Flux.train!(loss, θ, ncycle(d, 10), opt_state)
@test norm(θ) < 1e-4

# test interaction with `train!`
θ = zeros(2)
X = ones(2, 10)
Y = fill(2, 10)
loss(x, y) = sum((y - x'*θ).^2)
loss(θ, x, y) = sum((y - x'*θ).^2)
d = DataLoader((X, Y))
Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
opt_state = Flux.setup(Descent(0.1), θ)
Flux.train!(loss, θ, ncycle(d, 10), opt_state)
@test norm(θ .- 1) < 1e-10

# specify the rng
Expand Down
2 changes: 1 addition & 1 deletion test/ext_cuda/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ CUDA.allowscalar(false)
m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
cm = gpu(m)

@test all(p isa CuArray for p in Flux.params(cm))
@test all(p isa CuArray for p in Flux.trainables(cm))
@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}

xs = rand(5, 5)
Expand Down
62 changes: 20 additions & 42 deletions test/ext_cuda/layers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,17 +110,17 @@ end
l = cl((2,2), 1=>3, bias = false) |> gpu
ip = zeros(Float32, 28,28,1,1) |> gpu
@test sum(l(ip)) ≈ 0.f0
gs = gradient(() -> sum(l(ip)), Flux.params(l))
@test l.bias ∉ gs.params
gs = gradient(l -> sum(l(ip)), l)[1]
@test gs.bias === nothing

@testset "Dense without bias" begin
l = Dense(ones(Float32, 4, 3), false) |> gpu
ip = zeros(Float32, 3, 7) |> gpu

@test sum(l(ip)) ≈ 0.f0
gs = gradient(() -> sum(l(ip)), Flux.params(l))
@test l.bias ∉ gs.params
gs = gradient(l -> sum(l(ip)), l)[1]
@test gs.bias === nothing

@testset "Extended BatchNorm" begin
Expand All @@ -133,13 +133,13 @@ end
μ_cpu = copy(m_cpu.μ)
@test m_cpu.μ μ_cpu
gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
@test !(m_cpu.μ μ_cpu)

μ_gpu = copy(m_gpu.μ)
@test m_gpu.μ μ_gpu
gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
@test !(m_gpu.μ μ_gpu)

@test Array(m_gpu.μ) m_cpu.μ
Expand All @@ -149,14 +149,14 @@ end
μ_cpu = copy(m_cpu.μ)
@test m_cpu.μ μ_cpu
gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
@test m_cpu.μ μ_cpu

μ_gpu = copy(m_gpu.μ)
@test m_gpu.μ μ_gpu
gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
@test m_gpu.μ μ_gpu

## In trainmode, always track statistics
Expand All @@ -165,52 +165,36 @@ end
@test !(m_cpu.μ μ_cpu)
μ_cpu = copy(m_cpu.μ)
gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
@test !(m_cpu.μ μ_cpu)

μ_gpu = copy(m_gpu.μ)
@test !(m_gpu.μ μ_gpu)
μ_gpu = copy(m_gpu.μ)
gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
@test !(m_gpu.μ μ_gpu)

## No errors if input type mismatch
# x_cpu = rand(Float64, 3, 2, 2)
# x_gpu = x_cpu |> gpu
# m_cpu(x_cpu)
# gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
# m_gpu(x_gpu)
# gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))

@testset "Two-streams Bilinear" begin
x = zeros(Float32,10,9) |> gpu
y = zeros(Float32,2,9) |> gpu
b = Flux.Bilinear(10, 2, 3) |> gpu
@test size(b(x,y)) == (3,9)
@test sum(abs2, b(x,y)) 0f0
gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
for (pgpu, pcpu) in zip(params(b), params(b_cpu))
@test gs_cpu[pcpu] Array(gs_gpu[pgpu])
@test size(b(x, y)) == (3,9)
@test sum(abs2, b(x, y)) 0f0
test_gradients(b |> cpu, x |> cpu, y |> cpu,
test_gpu=true, compare_finite_diff=false, loss=(m, x, y) -> mean(abs2, m(x, y)))

@testset "Two-streams Bilinear" begin
x = zeros(Float32,10,9) |> gpu
y = zeros(Float32,2,9) |> gpu
b = Flux.Bilinear(10, 2, 3) |> gpu
@test size(b(x,y)) == (3,9)
@test sum(abs2, b(x,y)) 0f0
gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
for (pgpu, pcpu) in zip(params(b), params(b_cpu))
@test gs_cpu[pcpu] Array(gs_gpu[pgpu])
@test size(b(x, y)) == (3,9)
@test sum(abs2, b(x, y)) 0f0
test_gradients(b |> cpu, x |> cpu, y |> cpu,
test_gpu=true, compare_finite_diff=false, loss=(m, x, y) -> mean(abs2, m(x, y)))

@testset "Parallel" begin
Expand All @@ -228,15 +212,9 @@ end

@testset "gradient" begin
input_cpu = randn(10, 10, 10, 10)
input_gpu = input_cpu |> gpu
layer_cpu = Parallel(+, x -> zero(x), identity)
layer_gpu = layer_cpu |> gpu
gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu))
gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu))
for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu))
@test gs_cpu[pcpu] gs_gpu[pgpu]
test_gradients(layer_cpu, randn(2, 2, 2, 2),
test_gpu=true, compare_finite_diff=false, loss=(m, x) -> mean(abs2, m(x)))

Expand Down

0 comments on commit e1989b5

Please sign in to comment.