don't test Optimise module
* remove Flux.params from tests
* broken
* deprecation in __old_to_new
* pen2
* remove params entirely
* export Optimisers
* cleanup
* fix ambiguity
* comment
CarloLucibello committed Nov 4, 2024
1 parent 7525499 commit 5bdf443
Showing 18 changed files with 211 additions and 420 deletions.
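For context, this commit completes the move from the implicit `Flux.params`/`Flux.Optimise` training style to the explicit Optimisers.jl style. Below is a minimal before/after sketch, my own illustration rather than part of the diff; the model, data, and `Adam(1f-3)` learning rate are placeholder choices, and it assumes a Flux version where explicit `train!` and `Flux.setup` are available.

```julia
using Flux

model = Dense(2 => 1)
data  = [(randn(Float32, 2, 8), randn(Float32, 1, 8))]

# Old implicit style, removed or deprecated by this change:
#   opt = Flux.Optimise.Adam()
#   Flux.train!((x, y) -> Flux.mse(model(x), y), Flux.params(model), data, opt)

# New explicit style: an Optimisers.jl rule, attached to the model with `setup`.
opt_state = Flux.setup(Adam(1f-3), model)
Flux.train!(model, data, opt_state) do m, x, y
    Flux.mse(m(x), y)
end
```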
11 changes: 4 additions & 7 deletions src/Flux.jl
@@ -12,6 +12,8 @@ using MLUtils
const stack = MLUtils.stack # now exported by Base
import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions
using Optimisers: freeze!, thaw!, adjust!, trainables
@reexport using Optimisers

using Random: default_rng
using Zygote, ChainRulesCore
using Zygote: Params, @adjoint, gradient, pullback
@@ -56,13 +58,8 @@ export Chain, Dense, Embedding, EmbeddingBag,
))

include("optimise/Optimise.jl")
using .Optimise
export Descent, Adam, Momentum, Nesterov, RMSProp,
AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam,
AdamW, RAdam, AdaBelief, InvDecay, ExpDecay,
WeightDecay, SignDecay, ClipValue, ClipNorm

export ClipGrad, OptimiserChain # these are const defined in deprecations, for ClipValue, Optimiser
using .Optimise: Optimise
export ClipValue # this is const defined in deprecations, for ClipGrad

include("train.jl")
using .Train
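In user-facing terms, `@reexport using Optimisers` means `using Flux` now brings in the Optimisers.jl rule types directly, replacing the long export list deleted above. A rough sketch of the consequence, my own rather than from the diff:

```julia
using Flux

opt = Adam(0.001)                     # now the Optimisers.jl Adam, not Flux.Optimise.Adam
opt isa Flux.Optimisers.AbstractRule  # true

# Composition likewise uses the Optimisers.jl names:
clipped = OptimiserChain(ClipNorm(1.0), Adam())
```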
48 changes: 35 additions & 13 deletions src/deprecations.jl
@@ -41,31 +41,40 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error(
""")

train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
train!(loss, model, data, _old_to_new(opt); cb)
train!(loss, model, data, __old_to_new(opt); cb)

# Next, to use the new `setup` with the still-exported old-style `Adam` etc:
import .Train: setup
setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
# ... and allow accidental use of `Optimisers.setup` to do the same:
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)


function __old_to_new(rule)
Base.depwarn("""Optimisers from Flux.Optimise module are deprecated.
Use optimisers from Optimisers.jl instead.""", :__old_to_new)
return _old_to_new(rule)
end

for T in [:Descent, :Adam, :Momentum, :Nesterov,
:AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
# :InvDecay, :ExpDecay,
:SignDecay,
]
@eval function _old_to_new(rule::$T)
@eval function _old_to_new(rule::Optimise.$T)
args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T))
Optimisers.$T(args...)
end
end
_old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
_old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
_old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
const ClipGrad = Optimise.ClipValue
_old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred
_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
const Optimiser = Optimisers.OptimiserChain
_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
# const ClipGrad = Optimise.ClipValue
const ClipValue = Optimisers.ClipGrad
_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred

_old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule")

@@ -83,8 +92,21 @@ function update!(opt::Optimise.AbstractOptimiser, model, grad)
# to accept only arrays. Remove if this causes problems!
# update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄)
error("""Invalid input to `update!`.
* For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)`
* For the explicit style, `update(state, model, grad)` needs `state = Flux.setup(opt, model)`.
* For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)`
* For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end

# TODO this friendly error should go in Optimisers.jl.
# remove after https://github.com/FluxML/Optimisers.jl/pull/181
function update!(opt::Optimisers.AbstractRule, model, grad)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end
function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end

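To make the deprecation path concrete, here is a small usage sketch of my own, not part of the diff: an old `Flux.Optimise` rule passed to `setup` is routed through `__old_to_new`, which warns and translates it, while calling `update!` with a bare rule instead of the state returned by `setup` hits the friendly errors added above. The model, data, and rule choices are placeholders.

```julia
using Flux

model = Dense(3 => 2)
x, y = randn(Float32, 3, 4), randn(Float32, 2, 4)

# Old-style rule: translated to Optimisers.Descent, with a deprecation warning
# (visible when Julia runs with --depwarn=yes).
opt_state = Flux.setup(Flux.Optimise.Descent(0.1), model)

grad = gradient(m -> Flux.mse(m(x), y), model)[1]
Flux.update!(opt_state, model, grad)       # correct: state produced by `setup`
# Flux.update!(Descent(0.1), model, grad)  # throws the "Invalid input to `update!`" error above
```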
4 changes: 2 additions & 2 deletions src/layers/conv.jl
@@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters
julia> layer(randn(100, 4, 64)) |> size
(98, 5, 64)
julia> Flux.params(layer) |> length
julia> Flux.trainables(layer) |> length
2
```
"""
@@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters
julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling)
(102, 4, 64)
julia> Flux.params(layer) |> length
julia> Flux.trainables(layer) |> length
2
```
"""
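The docstring examples above swap `Flux.params` for `Flux.trainables`, which comes from Optimisers.jl. A brief sketch of the replacement call, mine rather than part of the diff:

```julia
using Flux

layer = Conv((3,), 4 => 5, σ)
ps = Flux.trainables(layer)   # flat vector of trainable arrays, here weight and bias
length(ps)                    # == 2, as the updated docstrings show
sum(length, ps)               # == 65, the total count given in the docstring header
```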
10 changes: 5 additions & 5 deletions src/layers/show.jl
@@ -104,15 +104,15 @@ function _layer_show(io::IO, layer, indent::Int=0, name=nothing)
_str = isnothing(name) ? "" : "$name = "
str = _str * _layer_string(io, layer)
print(io, " "^indent, str, indent==0 ? "" : ",")
if !isempty(params(layer))
if !isempty(trainables(layer))
print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str)))
printstyled(io, "# ", underscorise(sum(length, params(layer); init=0)), " parameters";
printstyled(io, "# ", underscorise(sum(length, trainables(layer); init=0)), " parameters";
color=:light_black)
nonparam = _childarray_sum(length, layer) - sum(length, params(layer), init=0)
nonparam = _childarray_sum(length, layer) - sum(length, trainables(layer), init=0)
if nonparam > 0
printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? " non-trainable" : ""; color=:light_black)
end
_nan_show(io, params(layer))
_nan_show(io, trainables(layer))
end
indent==0 || println(io)
end
@@ -127,7 +127,7 @@ function _layer_string(::IO, a::AbstractArray)
end

function _big_finale(io::IO, m)
ps = params(m)
ps = trainables(m)
if length(ps) > 2
pars = underscorise(sum(length, ps; init=0))
bytes = Base.format_bytes(Base.summarysize(m))
2 changes: 0 additions & 2 deletions src/outputsize.jl
@@ -302,8 +302,6 @@ function ChainRulesCore.rrule(::typeof(striplazy), m)
striplazy(m), _ -> error("striplazy should never be used within a gradient")
end

params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.")

Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.")

function Base.show(io::IO, l::LazyLayer)
10 changes: 6 additions & 4 deletions test/data.jl
@@ -80,18 +80,20 @@ using Random
# test interaction with `train!`
θ = ones(2)
X = zeros(2, 10)
loss(x) = sum((x .- θ).^2)
loss(θ, x) = sum((x .- θ).^2)
d = DataLoader(X)
Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
opt_state = Flux.setup(Descent(0.1), θ)
Flux.train!(loss, θ, ncycle(d, 10), opt_state)
@test norm(θ) < 1e-4

# test interaction with `train!`
θ = zeros(2)
X = ones(2, 10)
Y = fill(2, 10)
loss(x, y) = sum((y - x'*θ).^2)
loss(θ, x, y) = sum((y - x'*θ).^2)
d = DataLoader((X, Y))
Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
opt_state = Flux.setup(Descent(0.1), θ)
Flux.train!(loss, θ, ncycle(d, 10), opt_state)
@test norm(θ .- 1) < 1e-10

# specify the rng
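The updated data tests show the same shift: the loss takes the parameters explicitly, and the optimiser state comes from `Flux.setup`, which also accepts a bare parameter array. A reduced sketch of that pattern, my own; it loops over epochs instead of using the `ncycle` helper from the test:

```julia
using Flux

θ = ones(2)
X = zeros(2, 10)
loss(θ, x) = sum((x .- θ) .^ 2)

d = Flux.DataLoader(X)                     # yields batches of X
opt_state = Flux.setup(Descent(0.1), θ)    # setup on a plain Vector of parameters

for epoch in 1:10
    Flux.train!(loss, θ, d, opt_state)     # the loss is called as loss(θ, batch)
end
```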
2 changes: 1 addition & 1 deletion test/ext_cuda/cuda.jl
@@ -21,7 +21,7 @@ CUDA.allowscalar(false)
m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
cm = gpu(m)

@test all(p isa CuArray for p in Flux.params(cm))
@test all(p isa CuArray for p in Flux.trainables(cm))
@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}

xs = rand(5, 5)
55 changes: 55 additions & 0 deletions test/ext_cuda/curnn.jl
@@ -0,0 +1,55 @@

@testset "RNN" begin
@testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5)
rnn = R(10, 5)
curnn = rnn |> gpu

Flux.reset!(rnn)
Flux.reset!(curnn)
x = batch_size == 1 ?
rand(Float32, 10) :
rand(Float32, 10, batch_size)
cux = gpu(x)

y, back = pullback((r, x) -> r(x), rnn, x)
cuy, cuback = pullback((r, x) -> r(x), curnn, cux)

@test y ≈ collect(cuy)

ȳ = randn(size(y))
m̄, x̄ = back(ȳ)
cum̄, cux̄ = cuback(gpu(ȳ))

@test x̄ ≈ collect(cux̄)
@test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi)
@test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh)
@test m̄[].cell.b ≈ collect(cum̄[].cell.b)
if m̄[].state isa Tuple
for (x, cx) in zip(m̄[].state, cum̄[].state)
@test x ≈ collect(cx)
end
else
@test m̄[].state ≈ collect(cum̄[].state)
end

Flux.reset!(rnn)
Flux.reset!(curnn)
ohx = batch_size == 1 ?
Flux.onehot(rand(1:10), 1:10) :
Flux.onehotbatch(rand(1:10, batch_size), 1:10)
cuohx = gpu(ohx)
y = (rnn(ohx); rnn(ohx))

cuy = (curnn(cuohx); curnn(cuohx))
@test y ≈ collect(cuy)

Flux.reset!(rnn)
Flux.reset!(curnn)
fx = rand(Float32, 10, batch_size, 3)
cufx = gpu(fx)
fy = (rnn(fx); rnn(fx))

cufy = (curnn(cufx); curnn(cufx))
@test fy ≈ collect(cufy)
end
end
62 changes: 20 additions & 42 deletions test/ext_cuda/layers.jl
@@ -110,17 +110,17 @@ end
l = cl((2,2), 1=>3, bias = false) |> gpu
ip = zeros(Float32, 28,28,1,1) |> gpu
@test sum(l(ip)) ≈ 0.f0
gs = gradient(() -> sum(l(ip)), Flux.params(l))
@test l.bias gs.params
gs = gradient(l -> sum(l(ip)), l)[1]
@test gs.bias === nothing
end

@testset "Dense without bias" begin
l = Dense(ones(Float32, 4, 3), false) |> gpu
ip = zeros(Float32, 3, 7) |> gpu

@test sum(l(ip)) ≈ 0.f0
gs = gradient(() -> sum(l(ip)), Flux.params(l))
@test l.bias gs.params
gs = gradient(l -> sum(l(ip)), l)[1]
@test gs.bias === nothing
end

@testset "Extended BatchNorm" begin
@@ -133,13 +133,13 @@ end
μ_cpu = copy(m_cpu.μ)
m_cpu(x_cpu)
@test m_cpu.μ ≈ μ_cpu
gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
@test !(m_cpu.μ ≈ μ_cpu)

μ_gpu = copy(m_gpu.μ)
m_gpu(x_gpu)
@test m_gpu.μ ≈ μ_gpu
gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
@test !(m_gpu.μ ≈ μ_gpu)

@test Array(m_gpu.μ) ≈ m_cpu.μ
@@ -149,14 +149,14 @@ end
μ_cpu = copy(m_cpu.μ)
m_cpu(x_cpu)
@test m_cpu.μ ≈ μ_cpu
gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
@test m_cpu.μ ≈ μ_cpu

testmode!(m_gpu)
μ_gpu = copy(m_gpu.μ)
m_gpu(x_gpu)
@test m_gpu.μ ≈ μ_gpu
gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
@test m_gpu.μ ≈ μ_gpu

## In trainmode, always track statistics
@@ -165,52 +165,36 @@ end
m_cpu(x_cpu)
@test !(m_cpu.μ ≈ μ_cpu)
μ_cpu = copy(m_cpu.μ)
gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
@test !(m_cpu.μ ≈ μ_cpu)

trainmode!(m_gpu)
μ_gpu = copy(m_gpu.μ)
m_gpu(x_gpu)
@test !(m_gpu.μ ≈ μ_gpu)
μ_gpu = copy(m_gpu.μ)
gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
@test !(m_gpu.μ ≈ μ_gpu)

## No errors if input type mismatch
# x_cpu = rand(Float64, 3, 2, 2)
# x_gpu = x_cpu |> gpu
# m_cpu(x_cpu)
# gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
# m_gpu(x_gpu)
# gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
end

@testset "Two-streams Bilinear" begin
x = zeros(Float32,10,9) |> gpu
y = zeros(Float32,2,9) |> gpu
b = Flux.Bilinear(10, 2, 3) |> gpu
@test size(b(x,y)) == (3,9)
@test sum(abs2, b(x,y)) ≈ 0f0
gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
for (pgpu, pcpu) in zip(params(b), params(b_cpu))
@test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu])
end
@test size(b(x, y)) == (3,9)
@test sum(abs2, b(x, y)) ≈ 0f0
test_gradients(b |> cpu, x |> cpu, y |> cpu,
test_gpu=true, compare_finite_diff=false, loss=o -> mean(abs2, o))
end

@testset "Two-streams Bilinear" begin
x = zeros(Float32,10,9) |> gpu
y = zeros(Float32,2,9) |> gpu
b = Flux.Bilinear(10, 2, 3) |> gpu
@test size(b(x,y)) == (3,9)
@test sum(abs2, b(x,y)) ≈ 0f0
gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
for (pgpu, pcpu) in zip(params(b), params(b_cpu))
@test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu])
end
@test size(b(x, y)) == (3,9)
@test sum(abs2, b(x, y)) ≈ 0f0
test_gradients(b |> cpu, x |> cpu, y |> cpu,
test_gpu=true, compare_finite_diff=false, loss=o -> mean(abs2, o))
end

@testset "Parallel" begin
@@ -228,15 +212,9 @@ end
end

@testset "gradient" begin
input_cpu = randn(10, 10, 10, 10)
input_gpu = input_cpu |> gpu
layer_cpu = Parallel(+, x -> zero(x), identity)
layer_gpu = layer_cpu |> gpu
gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu))
gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu))
for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu))
@test gs_cpu[pcpu] ≈ gs_gpu[pgpu]
end
test_gradients(layer_cpu, randn(5, 5, 5, 5),
test_gpu=true, compare_finite_diff=false, loss=o -> mean(abs2, o))
end
end

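Most of the churn in this test file is one substitution: implicit `gradient(() -> ..., params(m))` becomes a gradient taken explicitly with respect to the model, or goes through the `test_gradients` helper with the keywords shown above. A minimal sketch of the explicit pattern, my own rather than part of the diff:

```julia
using Flux

m = Dense(3 => 2)
x = randn(Float32, 3, 5)

# Implicit style being removed:
#   gs = gradient(() -> sum(abs2, m(x)), Flux.params(m));  gs[m.weight]

# Explicit style used in the updated tests:
gs = gradient(m -> sum(abs2, m(x)), m)[1]
gs.weight   # gradient component with the same shape as m.weight
gs.bias
```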