diff --git a/src/deprecations.jl b/src/deprecations.jl index 6148894dbe..0b146f7c05 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -41,13 +41,20 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error """) train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) = - train!(loss, model, data, _old_to_new(opt); cb) + train!(loss, model, data, __old_to_new(opt); cb) # Next, to use the new `setup` with the still-exported old-style `Adam` etc: import .Train: setup -setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model) +setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) # ... and allow accidental use of `Optimisers.setup` to do the same: -Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model) +Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) + + +function __old_to_new(rule) + Base.depwarn("""Optimisers from Flux.Optimise module are deprecated. + Use optimisers from Optimisers.jl instead.""", :__old_to_new) + return _old_to_new(rule) +end for T in [:Descent, :Adam, :Momentum, :Nesterov, :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief, diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 8ba07b95a8..14ed11e319 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) -julia> Flux.params(layer) |> length +julia> Flux.trainables(layer) |> length 2 ``` """ @@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling) (102, 4, 64) -julia> Flux.params(layer) |> length +julia> Flux.trainables(layer) |> length 2 ``` """ diff --git a/test/ext_cuda/cuda.jl b/test/ext_cuda/cuda.jl index 709cef7aef..4baaa47aea 100644 --- a/test/ext_cuda/cuda.jl +++ b/test/ext_cuda/cuda.jl @@ -19,7 +19,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) cm = gpu(m) - @test all(p isa CuArray for p in Flux.params(cm)) + @test all(p isa CuArray for p in Flux.trainables(cm)) @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} xs = rand(5, 5) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 95da13f0c9..e4f8b23ea9 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -40,11 +40,11 @@ using Flux: activations @testset "Activations" begin c = Chain(Dense(3,5,relu), Dense(5,1,relu)) X = Float32.([1.0; 1.0; 1.0]) - @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c)) + @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c) c2 = Chain(enc = c[1], dec = c[2]) @test Flux.activations(c, X) == Flux.activations(c2, X) - @test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2)) + @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c2) end @testset "Dense" begin @@ -156,9 +156,9 @@ using Flux: activations @test mo(input) == target end - @testset "params" begin + @testset "trainables" begin mo = Maxout(()->Dense(32, 64), 4) - ps = Flux.params(mo) + ps = Flux.trainables(mo) @test length(ps) == 8 #4 alts, each with weight and bias end end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 2e75a1e39d..8780fef957 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -43,28 +43,30 @@ end @test sum(op) == prod(size(op)) @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) - bias = Conv((2,2), 1=>3, bias = false) |> lmap - op = bias(ip) + model = Conv((2,2), 1=>3, bias = false) |> lmap + op = model(ip) @test sum(op) ≈ 0.f0 - gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) - @test bias.bias ∉ gs.params + g = gradient(m -> sum(m(ip)), model)[1] + @test g.bias isa Nothing end - # Train w/o bias and make sure no convergence happens - # when only bias can be converged - bias = Conv((2, 2), 1=>3, bias = false); - ip = zeros(Float32, 28,28,1,1) - op = zeros(Float32, 27,27,3,1) .+ 2.f0 - opt = Descent() - - for _ = 1:10^3 - gs = gradient(Flux.params(bias)) do - Flux.Losses.mse(bias(ip), op) + @testset "no bias train" begin + # Train w/o bias and make sure no convergence happens + # when only bias can be converged + model = Conv((2, 2), 1=>3, bias = false); + ip = zeros(Float32, 28,28,1,1) + op = zeros(Float32, 27,27,3,1) .+ 2.f0 + opt_state = Flux.setup(Descent(), model) + + for _ = 1:10^3 + g = gradient(model) do m + Flux.mse(m(ip), op) + end[1] + Flux.update!(opt_state, model, g) end - Flux.Optimise.update!(opt, params(bias), gs) - end - @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0 + @test Flux.Losses.mse(model(ip), op) ≈ 4.f0 + end @testset "Grouped Conv" begin ip = rand(Float32, 28, 100, 2) @@ -164,11 +166,11 @@ end m = ConvTranspose((3,3), 1=>1) # Test that the gradient call does not throw: #900 - @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m)[1] x = zeros(Float32, 5, 5, 2, 4) m = ConvTranspose((3,3), 2=>3) - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m)[1] # test ConvTranspose supports groups argument x = randn(Float32, 10, 10, 2, 3) @@ -178,7 +180,7 @@ end m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad()) @test size(m2.weight) == (3,3,2,2) @test size(m1(x)) == size(m2(x)) - @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m2)[1] x = randn(Float32, 10, 2,1) m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index be7c5dec92..f678297eaa 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -129,7 +129,7 @@ end 2.0 4.0 6.0] @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -211,9 +211,9 @@ end @inferred m(x) end - @test length(Flux.params(BatchNorm(10))) == 2 - @test length(Flux.params(BatchNorm(10, affine=true))) == 2 - @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + @test length(Flux.trainables(BatchNorm(10))) == 2 + @test length(Flux.trainables(BatchNorm(10, affine=true))) == 2 + @test length(Flux.trainables(BatchNorm(10, affine=false))) == 0 @test BatchNorm(5; active=true).active === true @test_throws Exception BatchNorm(5; active=:something_else) @@ -224,7 +224,7 @@ end let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float32.(x) @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -287,7 +287,7 @@ end x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float64.(x) y = m(x) μ = mean(x, dims=1) @@ -300,7 +300,7 @@ end let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test length(Flux.trainables(m)) == 0 x = Float64.(x) y = m(x) @@ -345,9 +345,9 @@ end @inferred m(x) end - @test length(Flux.params(InstanceNorm(10))) == 0 - @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 - @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + @test length(Flux.trainables(InstanceNorm(10))) == 0 + @test length(Flux.trainables(InstanceNorm(10, affine=true))) == 2 + @test length(Flux.trainables(InstanceNorm(10, affine=false))) == 0 @test InstanceNorm(5; active=true).active === true @test_throws Exception InstanceNorm(5; active=:something_else) @@ -370,10 +370,10 @@ end m = LayerNorm((2,3,4)) @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 m = LayerNorm((2,3,4), affine=false) @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test length(Flux.trainables(m)) == 0 end @testset "GroupNorm" begin @@ -383,7 +383,7 @@ end let m = GroupNorm(4,2), sizes = (3,4,2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float32.(x) @test m.β == [0, 0, 0, 0] # initβ(32) @test m.γ == [1, 1, 1, 1] # initγ(32) diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index 7df8b0d4c2..35a3b0211f 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -1,36 +1,5 @@ using LinearAlgebra -@testset "RNN gradients-implicit" begin - layer = Flux.Recur(Flux.RNNCell(1, 1, identity)) - layer.cell.Wi .= 5.0 - layer.cell.Wh .= 4.0 - layer.cell.b .= 0.0f0 - layer.cell.state0 .= 7.0 - x = [[2.0f0], [3.0f0]] - - # theoretical primal gradients - primal = - layer.cell.Wh .* (layer.cell.Wh * layer.cell.state0 .+ x[1] .* layer.cell.Wi) .+ - x[2] .* layer.cell.Wi - ∇Wi = x[1] .* layer.cell.Wh .+ x[2] - ∇Wh = 2 .* layer.cell.Wh .* layer.cell.state0 .+ x[1] .* layer.cell.Wi - ∇b = layer.cell.Wh .+ 1 - ∇state0 = layer.cell.Wh .^ 2 - - Flux.reset!(layer) - ps = Flux.params(layer) - e, g = Flux.withgradient(ps) do - out = [layer(xi) for xi in x] - sum(out[2]) - end - - @test primal[1] ≈ e - @test ∇Wi ≈ g[ps[1]] - @test ∇Wh ≈ g[ps[2]] - @test ∇b ≈ g[ps[3]] - @test ∇state0 ≈ g[ps[4]] - -end @testset "RNN gradients-explicit" begin layer = Flux.Recur(Flux.RNNCell(1, 1, identity)) @@ -70,9 +39,9 @@ end for r ∈ [RNN,] rnn = r(2 => 3) Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do + grads_seq = gradient(rnn) do rnn sum([rnn(s) for s in seq][3]) - end + end[1] Flux.reset!(rnn); bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh * tanh.(rnn.cell.Wi * seq[2] + Wh * @@ -82,7 +51,7 @@ end + rnn.cell.b) + rnn.cell.b)), rnn.cell.Wh) - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] + @test_broken grads_seq.cell.Wh ≈ bptt[1] end end @@ -92,9 +61,9 @@ end for r ∈ [RNN,] rnn = r(2 => 3) Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do + grads_seq = gradient(rnn) do rnn sum([rnn(s) for s in seq][3]) - end + end[1] Flux.reset!(rnn); bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh * tanh.(rnn.cell.Wi * seq[2] + Wh * @@ -104,7 +73,7 @@ end + rnn.cell.b) + rnn.cell.b)), rnn.cell.Wh) - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] + @test_broken grads_seq.cell.Wh ≈ bptt[1] end end @@ -112,9 +81,9 @@ end seq = rand(Float32, (2, 1, 3)) rnn = RNN(2 => 3) Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do + grads_seq = gradient(rnn) do rnn sum(rnn(seq)[:, :, 3]) - end + end[1] Flux.reset!(rnn); bptt = gradient(rnn.cell.Wh) do Wh # calculate state 1 @@ -131,7 +100,7 @@ end rnn.cell.b) sum(s3) # loss is sum of state 3 end - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] + @test_broken grads_seq.cell.Wh ≈ bptt[1] end @testset "RNN-shapes" begin diff --git a/test/optimise.jl b/test/optimise.jl deleted file mode 100644 index c63ba85727..0000000000 --- a/test/optimise.jl +++ /dev/null @@ -1,222 +0,0 @@ -using Flux.Optimise -using Flux.Optimise: runall -using Flux: Params, gradient -import FillArrays, ComponentArrays -import Optimisers -using Test -using Random - -@testset "Optimise" begin - # Ensure rng has different state inside and outside the inner @testset - # so that w and w' are different - Random.seed!(84) - w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] - Random.seed!(42) - w′ = randn(10, 10) - b = false - loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) - for t = 1: 10^5 - θ = params([w′, b]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Optimiser" begin - Random.seed!(84) - w = randn(10, 10) - @testset for Opt in [InvDecay, WeightDecay, ExpDecay, SignDecay] - Random.seed!(42) - w′ = randn(10, 10) - loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:10^5 - θ = Params([w′]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Training Loop" begin - - # Test multiple callbacks - x = 0 - fs = [() -> (), () -> x = 1] - cbs = runall(fs) - cbs() - @test x == 1 - - r = rand(3, 3) - loss(x) = sum(x .* x) - Flux.train!(loss, Flux.params(r), (r,), Descent()) -end - -@testset "Stop on NaN" begin - m = Dense(1 => 1) - m.weight .= 0 - CNT = 0 - @test_throws DomainError Flux.train!(Flux.params(m), 1:100, Descent(0.1)) do i - CNT += 1 - (i == 51 ? NaN32 : 1f0) * sum(m([1.0])) - end - @test CNT == 51 # stopped early - @test m.weight[1] ≈ -5 # did not corrupt weights -end - -@testset "ExpDecay" begin - - @testset "Sanity Check" begin - o = ExpDecay(0.2, 0.5, 1, 1e-3) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - @testset "starting step" begin - start = 4 - o = ExpDecay(0.2, 0.5, 1, 1e-3, start) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - w = randn(10, 10) - o = ExpDecay(0.1, 0.1, 1000, 1e-4) - w1 = randn(10,10) - loss(x) = Flux.Losses.mse(w*x, w1*x) - flag = 1 - decay_steps = [] - for t = 1:10^5 - prev_eta = o.eta - θ = Params([w1]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - prev_grad = collect(θ̄[w1]) - delta = Optimise.apply!(o, w1, θ̄[w1]) - w1 .-= delta - new_eta = o.eta - if new_eta != prev_eta - push!(decay_steps, t) - end - array = fill(o.eta, size(prev_grad)) - if array .* prev_grad != delta - flag = 0 - end - end - @test flag == 1 - # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). - ground_truth = [] - for i in 1:4 - push!(ground_truth, 1000*i) # Expected decay steps for this example. - end - @test decay_steps == ground_truth - @test o.eta == o.clip -end - -@testset "Clipping" begin - w = randn(10, 10) - loss(x) = sum(w * x) - θ = Params([w]) - x = 1000 * randn(10) - w̄ = gradient(() -> loss(x), θ)[w] - w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) - @test all(w̄_value .<= 1) - w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) - @test norm(w̄_norm) <= 1 -end - -@testset "update!: handle Fills from Zygote" begin - w = randn(10,10) - wold = copy(w) - g = FillArrays.Ones(size(w)) - opt = Descent(0.1) - Flux.update!(opt, w, g) - @test w ≈ wold .- 0.1 - - w = randn(3) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> w[1], θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w[1] ≈ wold[1] .- 0.1 - @test w[2:3] ≈ wold[2:3] - - ## Issue #1510 - w = randn(10,10) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 -end - -@testset "update!: handle ComponentArrays" begin - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - opt_state = Optimisers.setup(Optimisers.Descent(0.1), w) - gs = gradient(w -> w.a + sum(w.c.b), w)[1] - Flux.update!(opt_state, w, gs) - @test w.a ≈ wold.a - 0.1 - @test w.b ≈ wold.b - @test w.c.b ≈ wold.c.b .- 0.1 - @test w.c.a ≈ wold.c.a - - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - opt_state = Optimisers.setup(Optimisers.Descent(0.1), w) - gs = gradient(w -> sum(w), w)[1] - Flux.update!(opt_state, w, gs) - @test w ≈ wold .- 0.1 -end - -# Flux PR #1776 -# We need to test that optimisers like Adam that maintain an internal momentum -# estimate properly calculate the second-order statistics on the gradients as -# the flow backward through the model. Previously, we would calculate second- -# order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which -# wreaks all sorts of havoc on our training loops. This test ensures that -# a simple optimization is montonically decreasing (up to learning step effects) -@testset "Momentum Optimisers and complex values" begin - # Test every optimiser that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im*ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end - - params = Flux.Params([w]) - opt = opt_ctor(1e-2) - - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx in 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) - end - end -end diff --git a/test/outputsize.jl b/test/outputsize.jl index 55cb823c5c..fe217c0fc9 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -248,7 +248,7 @@ end @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" @test Flux.striplazy(ld) isa Dense - @test_throws Exception Flux.params(lm) + @test_throws Exception Flux.trainables(lm) @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) diff --git a/test/runtests.jl b/test/runtests.jl index ef3d67f4d7..86a9885c90 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,8 +30,7 @@ Random.seed!(0) include("loading.jl") end - @testset "Optimise / Train" begin - include("optimise.jl") + @testset "Train" begin include("train.jl") include("tracker.jl") end diff --git a/test/test_utils.jl b/test/test_utils.jl index 004d3035ad..fe52bb30de 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -67,17 +67,9 @@ function gpu_autodiff_test( checkgrad || return ### GRADIENT WITH RESPECT TO INPUT ### - - y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) - @test check_type(y_cpu) - Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) - gs_cpu = back_cpu(Δ_cpu) - - Δ_gpu = Δ_cpu |> gpu - y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) - @test check_type(y_gpu) - gs_gpu = back_gpu(Δ_gpu) - + y_cpu, gs_cpu = withgradient(x -> sum(f_cpu(x...)), xs_cpu...) + y_gpu, gs_gpu = withgradient(x -> sum(f_gpu(x...)), xs_gpu...) + if test_equal @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) @@ -86,21 +78,11 @@ function gpu_autodiff_test( end ### GRADIENT WITH RESPECT TO f ### - - ps_cpu = Flux.params(f_cpu) - y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) - gs_cpu = back_cpu(Δ_cpu) - - ps_gpu = Flux.params(f_gpu) - y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) - gs_gpu = back_gpu(Δ_gpu) + g_cpu = gradient(f -> sum(f(xs_cpu...)), f_cpu)[1] + g_gpu = gradient(f -> sum(f(xs_gpu...)), f_gpu)[1] if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - @assert length(ps_gpu) == length(ps_cpu) - for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) - check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu]; atol, rtol, allow_nothing) - end + check_grad(g_gpu, g_cpu; atol, rtol, allow_nothing) end end diff --git a/test/train.jl b/test/train.jl index 38338c19b9..8824839965 100644 --- a/test/train.jl +++ b/test/train.jl @@ -93,35 +93,25 @@ end x = rand(2) y1 = m(x) # before - # Implicit gradient - gold = Zygote.gradient(() -> m(x), Flux.params(m)) - @test gold isa Flux.Zygote.Grads - @test_throws ErrorException Flux.update!(Flux.Adam(), m, gold) # friendly - Flux.update!(Flux.Adam(), Flux.params(m), gold) - y2 = m(x) - @test y2 < y1 - # Explicit gradient gs = Zygote.gradient(marg -> marg(x), m) @test gs isa Tuple - @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs) # friendly - @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs[1]) # friendly @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs) # friendly @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs[1]) # friendly s = Flux.setup(Adam(), m) @info "ignore this warning, just testing an upgrade path:" Flux.update!(s, m, gs) # Chain + Tuple can be unambiguously sorted out + y2 = m(x) + @test y2 < y1 + Flux.update!(s, m, gs[1]) # finally, this is the correct thing y3 = m(x) @test y3 < y2 - Flux.update!(s, m, gs[1]) # finally, this is the correct thing - y4 = m(x) - @test y4 < y3 # Also check that if you import the new Adam, then Flux.setup does still work! s2 = Flux.setup(Optimisers.Adam(), m) Flux.update!(s2, m, gs[1]) - y5 = m(x) - @test y5 < y4 + y4 = m(x) + @test y4 < y3 end for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) @@ -147,28 +137,21 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) end diff1 = model.weight .- init_weight - # Take 2: the same, but with Flux.params. Was broken for a bit, no tests! - # skipping this test for Enzyme cause implicit params is unsupported - if name == "Zygote" - model.weight .= init_weight - model.bias .= 0 - pen2(x::AbstractArray) = sum(abs2, x)/2 - opt = Flux.setup(Adam(0.1), model) - - @test begin - trainfn!(model, data, opt) do m, x, y - err = Flux.mse(m(x), y) - l2 = sum(pen2, Flux.params(m)) - err + 0.33 * l2 - end - - diff2 = model.weight .- init_weight - @test diff1 ≈ diff2 - - true - end broken = VERSION >= v"1.11" + # Take 2: the same, but with Optimisers.trainables. + model.weight .= init_weight + model.bias .= 0 + pen2(x::AbstractArray) = sum(abs2, x)/2 + opt = Flux.setup(Adam(0.1), model) + + trainfn!(model, data, opt) do m, x, y + err = Flux.mse(m(x), y) + l2 = sum(pen2, Flux.trainables(m)) + err + 0.33 * l2 end + diff2 = model.weight .- init_weight + @test diff1 ≈ diff2 + # Take 3: using WeightDecay instead. Need the /2 above, to match exactly. model.weight .= init_weight model.bias .= 0 diff --git a/test/utils.jl b/test/utils.jl index b526b63286..8cdbac2daf 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -250,41 +250,32 @@ end end end -@testset "Params" begin +@testset "Trainables" begin m = Dense(10, 5) - @test size.(params(m)) == [(5, 10), (5,)] + @test size.(Flux.trainables(m)) == [(5, 10), (5,)] m = RNN(10, 5) - @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5, 1)] + @test size.(Flux.trainables(m)) == [(5, 10), (5, 5), (5,), (5, 1)] # Layer duplicated in same chain, params just once pls. c = Chain(m, m) - @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5, 1)] + @test size.(Flux.trainables(c)) == [(5, 10), (5, 5), (5,), (5, 1)] # Self-referential array. Just want params, no stack overflow pls. r = Any[nothing,m] r[1] = r - @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5, 1)] + @test_broken size.(Flux.trainables(r)) == [(5, 10), (5, 5), (5,), (5, 1)] # Ensure functor explores inside Transpose but not SubArray m = (x = view([1,2,3]pi, 1:2), y = transpose([4 5]pi)) - @test size.(Flux.params(m)) == [(2,), (1, 2)] + @test size.(Flux.trainables(m)) == [(2,), (1, 2)] end -@testset "params gradient" begin +@testset "trainables gradient" begin m = (x=[1,2.0], y=[3.0]); - @test begin - # Explicit -- was broken by #2054 / then fixed / now broken again on julia v1.11 - gnew = gradient(m -> (sum(norm, Flux.params(m))), m)[1] - @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159] - @test gnew.y ≈ [1.0] - true - end broken = VERSION >= v"1.11" - - # Implicit - gold = gradient(() -> (sum(norm, Flux.params(m))), Flux.params(m)) - @test gold[m.x] ≈ [0.4472135954999579, 0.8944271909999159] - @test gold[m.y] ≈ [1.0] + gnew = gradient(m -> (sum(norm, Flux.trainables(m))), m)[1] + @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159] + @test gnew.y ≈ [1.0] end @testset "Precision" begin @@ -564,10 +555,10 @@ end @testset "Shared parameters" begin mat = [1 2; 3 4.0] simple = ((nothing, mat, (3, mat, 4))) - @test length(Flux.params(simple)) == 1 + @test length(Flux.trainables(simple)) == 1 oneadj = (nt = (m = mat, a = mat')) - @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 + @test length(Flux.trainables(oneadj)) == 1 # needs Functors@0.3 @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] @@ -653,13 +644,13 @@ end model = Model(d, d) # Works - g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) + g1 = Flux.gradient(m -> sum(m(x)), model)[1] p, re = Flux.destructure(model) # Fails - g2 = Flux.gradient(p -> sum(re(p)(x)), p) + g2 = Flux.gradient(p -> sum(re(p)(x)), p)[1] - @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) + @test g2 ≈ vcat(g1.a.weight + g1.b.weight, g1.a.bias + g1.b.bias) end @testset "issue 1826" begin