
Commit

don't test Optimise module
remove Flux.params from tests

broken

deprecation in __old_to_new

pen2
CarloLucibello committed Oct 12, 2024
1 parent 09a16ee commit 016dca9
Showing 13 changed files with 102 additions and 391 deletions.
13 changes: 10 additions & 3 deletions src/deprecations.jl
@@ -41,13 +41,20 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error
""")

train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
train!(loss, model, data, _old_to_new(opt); cb)
train!(loss, model, data, __old_to_new(opt); cb)

# Next, to use the new `setup` with the still-exported old-style `Adam` etc:
import .Train: setup
setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
# ... and allow accidental use of `Optimisers.setup` to do the same:
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)


function __old_to_new(rule)
Base.depwarn("""Optimisers from Flux.Optimise module are deprecated.
Use optimisers from Optimisers.jl instead.""", :__old_to_new)
return _old_to_new(rule)
end

for T in [:Descent, :Adam, :Momentum, :Nesterov,
:AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
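For context, a minimal usage sketch of the deprecation path added above (not part of the diff; the toy model and learning rate are illustrative, and it assumes Optimisers.jl is importable in the environment, as it is a Flux dependency): calling `setup` with an old-style rule now warns once via `__old_to_new` and then behaves like the equivalent Optimisers.jl rule.

```julia
using Flux
import Optimisers

model = Dense(2 => 3)   # hypothetical toy model

# Old-style rule from Flux.Optimise: `Flux.setup` routes it through
# `__old_to_new`, which prints the depwarn and converts it to the
# matching Optimisers.jl rule before building the optimiser state.
opt_state = Flux.setup(Flux.Optimise.Adam(1e-3), model)

# Non-deprecated spelling: construct the Optimisers.jl rule directly.
opt_state = Flux.setup(Optimisers.Adam(1e-3), model)
```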
4 changes: 2 additions & 2 deletions src/layers/conv.jl
@@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters
julia> layer(randn(100, 4, 64)) |> size
(98, 5, 64)
julia> Flux.params(layer) |> length
julia> Flux.trainables(layer) |> length
2
```
"""
@@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters
julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling)
(102, 4, 64)
julia> Flux.params(layer) |> length
julia> Flux.trainables(layer) |> length
2
```
"""
2 changes: 1 addition & 1 deletion test/ext_cuda/cuda.jl
@@ -19,7 +19,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray
m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
cm = gpu(m)

@test all(p isa CuArray for p in Flux.params(cm))
@test all(p isa CuArray for p in Flux.trainables(cm))
@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}

xs = rand(5, 5)
8 changes: 4 additions & 4 deletions test/layers/basic.jl
@@ -40,11 +40,11 @@ using Flux: activations
@testset "Activations" begin
c = Chain(Dense(3,5,relu), Dense(5,1,relu))
X = Float32.([1.0; 1.0; 1.0])
@test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c))
@test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c)

c2 = Chain(enc = c[1], dec = c[2])
@test Flux.activations(c, X) == Flux.activations(c2, X)
@test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2))
@test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c2)
end

@testset "Dense" begin
@@ -156,9 +156,9 @@ using Flux: activations
@test mo(input) == target
end

@testset "params" begin
@testset "trainables" begin
mo = Maxout(()->Dense(32, 64), 4)
ps = Flux.params(mo)
ps = Flux.trainables(mo)
@test length(ps) == 8 #4 alts, each with weight and bias
end
end
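The test rewrites above follow one pattern throughout: implicit `Flux.params`/`Params` gradients become explicit gradients taken with respect to the model, and parameter counts are queried with `Flux.trainables`. A minimal sketch of that pattern (hypothetical, mirroring the `Chain` used in the activations test; not part of the diff):

```julia
using Flux

c = Chain(Dense(3 => 5, relu), Dense(5 => 1, relu))
x = Float32[1, 1, 1]

# Old implicit style (what the tests used to do):
#   gs = gradient(() -> sum(c(x)), Flux.params(c))
# New explicit style: differentiate with respect to the model itself;
# the result is a NamedTuple mirroring the model's structure.
g = gradient(m -> sum(m(x)), c)[1]

# `Flux.trainables` replaces `Flux.params` for collecting trainable
# arrays: two Dense layers give two weights and two biases.
@assert length(Flux.trainables(c)) == 4
```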
42 changes: 22 additions & 20 deletions test/layers/conv.jl
@@ -43,28 +43,30 @@ end
@test sum(op) == prod(size(op))

@testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32)
bias = Conv((2,2), 1=>3, bias = false) |> lmap
op = bias(ip)
model = Conv((2,2), 1=>3, bias = false) |> lmap
op = model(ip)
@test sum(op) ≈ 0.f0
gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
@test bias.bias ∉ gs.params
g = gradient(m -> sum(m(ip)), model)[1]
@test g.bias isa Nothing
end

# Train w/o bias and make sure no convergence happens
# when only bias can be converged
bias = Conv((2, 2), 1=>3, bias = false);
ip = zeros(Float32, 28,28,1,1)
op = zeros(Float32, 27,27,3,1) .+ 2.f0
opt = Descent()

for _ = 1:10^3
gs = gradient(Flux.params(bias)) do
Flux.Losses.mse(bias(ip), op)
@testset "no bias train" begin
# Train w/o bias and make sure no convergence happens
# when only bias can be converged
model = Conv((2, 2), 1=>3, bias = false);
ip = zeros(Float32, 28,28,1,1)
op = zeros(Float32, 27,27,3,1) .+ 2.f0
opt_state = Flux.setup(Descent(), model)

for _ = 1:10^3
g = gradient(model) do m
Flux.mse(m(ip), op)
end[1]
Flux.update!(opt_state, model, g)
end
Flux.Optimise.update!(opt, params(bias), gs)
end

@test Flux.Losses.mse(bias(ip), op) ≈ 4.f0
@test Flux.Losses.mse(model(ip), op) ≈ 4.f0
end

@testset "Grouped Conv" begin
ip = rand(Float32, 28, 100, 2)
@@ -164,11 +166,11 @@ end

m = ConvTranspose((3,3), 1=>1)
# Test that the gradient call does not throw: #900
@test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads
g = gradient(m -> sum(m(x)), m)[1]

x = zeros(Float32, 5, 5, 2, 4)
m = ConvTranspose((3,3), 2=>3)
@test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
g = gradient(m -> sum(m(x)), m)[1]

# test ConvTranspose supports groups argument
x = randn(Float32, 10, 10, 2, 3)
@@ -178,7 +180,7 @@ end
m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad())
@test size(m2.weight) == (3,3,2,2)
@test size(m1(x)) == size(m2(x))
@test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads
g = gradient(m -> sum(m(x)), m2)[1]

x = randn(Float32, 10, 2,1)
m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2)
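The rewritten "no bias train" test drives the explicit loop by hand with `Flux.setup`/`Flux.update!`. For comparison, a hypothetical standalone version of the same check using `Flux.train!` (the data iterator and the `≈ 4` assertion mirror the test; nothing here is part of the diff):

```julia
using Flux

model = Conv((2, 2), 1 => 3, bias = false)
ip = zeros(Float32, 28, 28, 1, 1)
op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0

opt_state = Flux.setup(Descent(), model)   # Descent() still warns via __old_to_new

# train! computes the explicit gradient and calls update! for each datapoint;
# here the "dataset" is the same (input, target) pair repeated 10^3 times.
Flux.train!((m, x, y) -> Flux.mse(m(x), y), model,
            Iterators.repeated((ip, op), 10^3), opt_state)

# With a zero input and no bias the output stays zero, so the loss cannot
# move towards the constant target of 2: the mse stays at 4.
@assert Flux.mse(model(ip), op) ≈ 4.0f0
```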
26 changes: 13 additions & 13 deletions test/layers/normalisation.jl
@@ -129,7 +129,7 @@ end
2.0 4.0 6.0]

@test Flux.hasaffine(m) == true
@test length(Flux.params(m)) == 2
@test length(Flux.trainables(m)) == 2

@test m.β == [0, 0] # initβ(2)
@test m.γ == [1, 1] # initγ(2)
@@ -211,9 +211,9 @@ end
@inferred m(x)
end

@test length(Flux.params(BatchNorm(10))) == 2
@test length(Flux.params(BatchNorm(10, affine=true))) == 2
@test length(Flux.params(BatchNorm(10, affine=false))) == 0
@test length(Flux.trainables(BatchNorm(10))) == 2
@test length(Flux.trainables(BatchNorm(10, affine=true))) == 2
@test length(Flux.trainables(BatchNorm(10, affine=false))) == 0

@test BatchNorm(5; active=true).active === true
@test_throws Exception BatchNorm(5; active=:something_else)
@@ -224,7 +224,7 @@ end
let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2),
x = reshape(collect(1:prod(sizes)), sizes)

@test length(Flux.params(m)) == 2
@test length(Flux.trainables(m)) == 2
x = Float32.(x)
@test m.β == [0, 0] # initβ(2)
@test m.γ == [1, 1] # initγ(2)
@@ -287,7 +287,7 @@ end
x = reshape(collect(1:prod(sizes)), sizes)

@test Flux.hasaffine(m) == true
@test length(Flux.params(m)) == 2
@test length(Flux.trainables(m)) == 2
x = Float64.(x)
y = m(x)
μ = mean(x, dims=1)
@@ -300,7 +300,7 @@ end
let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2),
x = reshape(collect(1:prod(sizes)), sizes)
@test Flux.hasaffine(m) == false
@test length(Flux.params(m)) == 0
@test length(Flux.trainables(m)) == 0

x = Float64.(x)
y = m(x)
@@ -345,9 +345,9 @@ end
@inferred m(x)
end

@test length(Flux.params(InstanceNorm(10))) == 0
@test length(Flux.params(InstanceNorm(10, affine=true))) == 2
@test length(Flux.params(InstanceNorm(10, affine=false))) == 0
@test length(Flux.trainables(InstanceNorm(10))) == 0
@test length(Flux.trainables(InstanceNorm(10, affine=true))) == 2
@test length(Flux.trainables(InstanceNorm(10, affine=false))) == 0

@test InstanceNorm(5; active=true).active === true
@test_throws Exception InstanceNorm(5; active=:something_else)
@@ -370,10 +370,10 @@ end

m = LayerNorm((2,3,4))
@test Flux.hasaffine(m) == true
@test length(Flux.params(m)) == 2
@test length(Flux.trainables(m)) == 2
m = LayerNorm((2,3,4), affine=false)
@test Flux.hasaffine(m) == false
@test length(Flux.params(m)) == 0
@test length(Flux.trainables(m)) == 0
end

@testset "GroupNorm" begin
Expand All @@ -383,7 +383,7 @@ end
let m = GroupNorm(4,2), sizes = (3,4,2),
x = reshape(collect(1:prod(sizes)), sizes)

@test length(Flux.params(m)) == 2
@test length(Flux.trainables(m)) == 2
x = Float32.(x)
@test m.β == [0, 0, 0, 0] # initβ(32)
@test m.γ == [1, 1, 1, 1] # initγ(32)
49 changes: 9 additions & 40 deletions test/layers/recurrent.jl
@@ -1,36 +1,5 @@
using LinearAlgebra

@testset "RNN gradients-implicit" begin
layer = Flux.Recur(Flux.RNNCell(1, 1, identity))
layer.cell.Wi .= 5.0
layer.cell.Wh .= 4.0
layer.cell.b .= 0.0f0
layer.cell.state0 .= 7.0
x = [[2.0f0], [3.0f0]]

# theoretical primal gradients
primal =
layer.cell.Wh .* (layer.cell.Wh * layer.cell.state0 .+ x[1] .* layer.cell.Wi) .+
x[2] .* layer.cell.Wi
∇Wi = x[1] .* layer.cell.Wh .+ x[2]
∇Wh = 2 .* layer.cell.Wh .* layer.cell.state0 .+ x[1] .* layer.cell.Wi
∇b = layer.cell.Wh .+ 1
∇state0 = layer.cell.Wh .^ 2

Flux.reset!(layer)
ps = Flux.params(layer)
e, g = Flux.withgradient(ps) do
out = [layer(xi) for xi in x]
sum(out[2])
end

@test primal[1] ≈ e
@test ∇Wi ≈ g[ps[1]]
@test ∇Wh ≈ g[ps[2]]
@test ∇b ≈ g[ps[3]]
@test ∇state0 ≈ g[ps[4]]

end

@testset "RNN gradients-explicit" begin
layer = Flux.Recur(Flux.RNNCell(1, 1, identity))
@@ -70,9 +39,9 @@ end
for r ∈ [RNN,]
rnn = r(2 => 3)
Flux.reset!(rnn)
grads_seq = gradient(Flux.params(rnn)) do
grads_seq = gradient(rnn) do rnn
sum([rnn(s) for s in seq][3])
end
end[1]
Flux.reset!(rnn);
bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh *
tanh.(rnn.cell.Wi * seq[2] + Wh *
@@ -82,7 +51,7 @@ end
+ rnn.cell.b)
+ rnn.cell.b)),
rnn.cell.Wh)
@test grads_seq[rnn.cell.Wh] ≈ bptt[1]
@test_broken grads_seq.cell.Wh ≈ bptt[1]
end
end

@@ -92,9 +61,9 @@ end
for r ∈ [RNN,]
rnn = r(2 => 3)
Flux.reset!(rnn)
grads_seq = gradient(Flux.params(rnn)) do
grads_seq = gradient(rnn) do rnn
sum([rnn(s) for s in seq][3])
end
end[1]
Flux.reset!(rnn);
bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh *
tanh.(rnn.cell.Wi * seq[2] + Wh *
@@ -104,17 +73,17 @@ end
+ rnn.cell.b)
+ rnn.cell.b)),
rnn.cell.Wh)
@test grads_seq[rnn.cell.Wh] ≈ bptt[1]
@test_broken grads_seq.cell.Wh ≈ bptt[1]
end
end

@testset "BPTT-3D" begin
seq = rand(Float32, (2, 1, 3))
rnn = RNN(2 => 3)
Flux.reset!(rnn)
grads_seq = gradient(Flux.params(rnn)) do
grads_seq = gradient(rnn) do rnn
sum(rnn(seq)[:, :, 3])
end
end[1]
Flux.reset!(rnn);
bptt = gradient(rnn.cell.Wh) do Wh
# calculate state 1
@@ -131,7 +100,7 @@ end
rnn.cell.b)
sum(s3) # loss is sum of state 3
end
@test grads_seq[rnn.cell.Wh] ≈ bptt[1]
@test_broken grads_seq.cell.Wh ≈ bptt[1]
end

@testset "RNN-shapes" begin
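A note on the explicit gradients these RNN tests now take: the result of `gradient(rnn) do rnn ... end[1]` is a NamedTuple mirroring the model, so the recurrent-weight gradient is reached by field access (`grads_seq.cell.Wh`) rather than by the old `grads_seq[rnn.cell.Wh]` lookup on a `Grads` object; the `@test_broken` markers track only whether that value matches the hand-rolled BPTT gradient. A minimal, hypothetical single-step illustration of that structure, using the cell directly (not part of the diff):

```julia
using Flux

cell = Flux.RNNCell(2 => 3)      # plain cell, no Recur state handling
h0 = zeros(Float32, 3)
x = rand(Float32, 2)

g = gradient(cell) do c
    h, y = c(h0, x)              # one step of the recurrence
    sum(y)
end[1]

g.Wh                             # NamedTuple field access, analogous to
                                 # grads_seq.cell.Wh in the tests above
```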